diff --git a/src/index.ts b/src/index.ts
index ed7ea98..08ada80 100755
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,9 +3,9 @@ import { Cluster } from "./cluster";
 import { Node } from "./node";
 import type { SyllableParams } from "./syllable";
 import { Syllable } from "./syllable";
-import type { SylOpts } from "./text";
+import type { KetivQere, SylOpts } from "./text";
 import { Text } from "./text";
 import { Word } from "./word";
 
 export { Char, Cluster, Node, Syllable, Text, Word };
-export type { SyllableParams, SylOpts };
+export type { KetivQere, SyllableParams, SylOpts };
diff --git a/src/text.ts b/src/text.ts
index a3cd3a2..8a778aa 100644
--- a/src/text.ts
+++ b/src/text.ts
@@ -3,10 +3,42 @@ import { Cluster } from "./cluster";
 import { Syllable } from "./syllable";
 import { holemWaw } from "./utils/holemWaw";
 import { convertsQametsQatan } from "./utils/qametsQatan";
-import { splitGroup } from "./utils/regularExpressions";
+import { splitGroup, taamim, taamimCaptureGroup } from "./utils/regularExpressions";
 import { sequence } from "./utils/sequence";
 import { Word } from "./word";
 
+export interface KetivQere {
+  /**
+   * The word or regex to match on
+   */
+  input: string | RegExp;
+  /**
+   * The output of the ketiv qere
+   *
+   * @remarks
+   * When using a callback, the parameter `text` is the whole text of the word, and `input` is the input of the ketiv qere
+   */
+  output:
+    | string
+    /**
+     * @param text the whole text of the word
+     * @param input the input of the ketiv qere
+     */
+    | ((text: string, input: KetivQere["input"]) => string);
+  /**
+   * Whether to ignore taamim in the target string
+   *
+   * @defaultValue true
+   */
+  ignoreTaamim?: boolean;
+  /**
+   * Whether to capture taamim from the input and add it to the output
+   *
+   * @defaultValue false
+   */
+  captureTaamim?: boolean;
+}
+
 /**
  * Options for determining syllabification that may differ according to reading traditions
  */
@@ -58,7 +90,8 @@ export interface SylOpts {
    *
    * @defaultValue `"preserve"`
    *
-   * @example update
+   * @example
+   * update
    * ```ts
    * const holemHaser = /\u{05BA}/u;
    * const str = "עָוֹן" // vav + holem
@@ -68,7 +101,8 @@ export interface SylOpts {
    *
    * ```
    *
-   * @example preserve
+   * @example
+   * preserve
    * ```ts
    * const holemHaser = /\u{05BA}/u;
    * const str = "עָוֹן" // vav + holem
@@ -77,7 +111,8 @@ export interface SylOpts {
    * holemHaser.test(newStr); // false
    * ```
    *
-   * @example remove
+   * @example
+   * remove
    * ```ts
    * const holemHaser = /\u{05BA}/u;
    * const str = "עָוֺן" // vav + holem haser
@@ -87,6 +122,78 @@ export interface SylOpts {
    * ```
    */
   holemHaser?: "update" | "preserve" | "remove";
+  /**
+   * An array of KetivQere objects
+   *
+   * @defaultValue `undefined`
+   *
+   * @example
+   * default
+   * ```ts
+   * const text = new Text("הִ֑וא", {
+   *  ketivQeres: [
+   *    {
+   *      input: "הִוא",
+   *      output: "הִיא"
+   *    }
+   *  ]
+   * });
+   * console.log(text.words[0].text);
+   * // הִיא
+   * ```
+   *
+   * @example
+   * `captureTaamim` set to `true`
+   * ```ts
+   * const text = new Text("הִ֑וא", {
+   *  ketivQeres: [
+   *    {
+   *      input: "הִוא",
+   *      output: "הִיא",
+   *      captureTaamim: true
+   *    }
+   *  ]
+   * });
+   * console.log(text.words[0].text);
+   * // הִ֑יא
+   * ```
+   *
+   * @example
+   * `ignoreTaamim` set to `false`
+   * ```ts
+   * const text = new Text("הִ֑וא", {
+   *  ketivQeres: [
+   *    {
+   *      input: "הִ֯וא",
+   *      output: "הִיא",
+   *      ignoreTaamim: false
+   *    }
+   *  ]
+   * });
+   * console.log(text.words[0].text);
+   * // הִ֑וא
+   * // does not match because the input taam is not the same as the Text taam
+   * ```
+   *
+   * @example
+   * `input` as a regular expression, and `output` as a callback
+   * ```ts
+   * const text = new Text("וַיָּבִיאּוּ", {
+   *  ketivQeres: [
+   *    {
+   *      input: /אּ/,
+   *      output: (word, input) => word.replace(input, "א")
+   *    }
+   *  ]
+   * });
+   * console.log(text.words[0].text);
+   * // וַיָּבִיאוּ
+   * ```
+   *
+   * @remarks
+   * KetivQere objects allow for flexible handling of words, mimicking how ketiv/qeres are used in biblical manuscripts
+   */
+  ketivQeres?: KetivQere[];
   /**
    * Determines whether to regard a sheva after a long vowel (excluding waw-shureq, see {@link wawShureq}) as a _sheva na'_, unless preceded by a meteg (see {@link shevaAfterMeteg}).
    *
@@ -240,6 +347,23 @@ export interface SylOpts {
 export class Text {
   #original: string;
   private options: SylOpts;
+  /**
+   * Cache for {@link SylOpts.ketivQeres}
+   *
+   * @privateRemarks
+   * This cache can be improved. Currently, it can only check for exact matches.
+   * So for example, if you have ketivQere options like this:
+   * ```js
+   * new Text("לֹא־נִפְלֵ֥את הִוא֙ מִמְּךָ֔ וְלֹ֥א רְחֹקָ֖ה הִֽוא׃", {
+   *  ketivQeres: [
+   *    { input: "הִוא", output: "הִוא" },
+   *  ]
+   * })
+   * ```
+   *
+   * The cache will miss because `הִוא֙` and `הִֽוא׃` are not exact matches, even though `ignoreTaamim` is `true`.
+   */
+  private ketivQereCache: { [k: string]: string } = {};
 
   /**
    * `Text` requires an input string,
@@ -254,6 +378,76 @@ export class Text {
     this.#original = this.options.allowNoNiqqud ? text : this.validateInput(text);
   }
 
+  /**
+   * Applies a single KetivQere to a word
+   *
+   * @param text the word to test, with taamim already removed when `ignoreTaamim` is set
+   * @param kq the KetivQere to apply
+   * @returns the replacement text, or `null` when `kq.input` does not match `text`
+   */
+  private applyKetivQere = (text: string, kq: KetivQere) => {
+    if (kq.input instanceof RegExp) {
+      const match = text.match(kq.input);
+      if (match) {
+        return typeof kq.output === "string" ? kq.output : kq.output(text, kq.input);
+      }
+    }
+
+    if (kq.input === text) {
+      return typeof kq.output === "string" ? kq.output : kq.output(text, kq.input);
+    }
+
+    return null;
+  };
+
+  /**
+   * Finds every taam in a word
+   *
+   * @param text the word to search
+   * @returns an iterator of matches, each holding a taam and its index in `text`
+   */
+  private captureTaamim = (text: string) => {
+    return text.matchAll(Text.taamimCaptureGroup);
+  };
+
+  /**
+   * Runs a word through the {@link SylOpts.ketivQeres}, stopping at the first entry that matches
+   *
+   * @param text the word as it appears in the text
+   * @returns the output of the first matching KetivQere, or `text` unchanged when none match
+   */
+  private processKetivQeres = (text: string) => {
+    if (Object.prototype.hasOwnProperty.call(this.ketivQereCache, text)) {
+      return this.ketivQereCache[text];
+    }
+
+    const ketivQeres = this.options.ketivQeres;
+
+    if (!ketivQeres?.length) {
+      return text;
+    }
+
+    for (const ketivQere of ketivQeres) {
+      const textWithoutTaamim = ketivQere.ignoreTaamim ? this.removeTaamim(text) : text;
+
+      const appliedKetivQere = this.applyKetivQere(textWithoutTaamim, ketivQere);
+
+      if (appliedKetivQere === null) {
+        continue; // no match: try the next KetivQere rather than giving up
+      }
+
+      const taamimChars = ketivQere.captureTaamim ? this.captureTaamim(text) : null;
+
+      const newText = taamimChars ? this.setTaamim(appliedKetivQere, taamimChars) : appliedKetivQere;
+
+      this.ketivQereCache[text] = newText;
+
+      return newText;
+    }
+
+    return text;
+  };
+
   private validateInput(text: string): string {
     const niqqud = /[\u{05B0}-\u{05BC}\u{05C7}]/u;
     if (!niqqud.test(text)) {
@@ -262,11 +456,62 @@ export class Text {
     return text;
   }
 
+  /**
+   * Validates the shape of each KetivQere
+   *
+   * @param ketivQeres the value passed as the `ketivQeres` option
+   * @returns `true` when every entry is valid
+   * @throws an Error naming the index of the first invalid entry
+   */
+  private validateKetivQeres(ketivQeres: SylOpts["ketivQeres"]) {
+    // if it's undefined, it's fine
+    if (!ketivQeres) {
+      return true;
+    }
+
+    // if there's no ketivQeres, it's fine
+    if (!ketivQeres.length) {
+      return true;
+    }
+
+    // validate the shape of the ketivQeres
+    for (const [index, ketivQere] of ketivQeres.entries()) {
+      const { input, output, ignoreTaamim, captureTaamim } = ketivQere;
+
+      if (input === undefined) {
+        throw new Error(`The ketivQere at index ${index} must have an input`);
+      }
+
+      if (!(input instanceof RegExp) && typeof input !== "string") {
+        throw new Error(`The input property of the ketivQere at index ${index} must be a string or RegExp`);
+      }
+
+      if (output === undefined) {
+        throw new Error(`The ketivQere at index ${index} must have an output`);
+      }
+
+      if (typeof output !== "string" && typeof output !== "function") {
+        throw new Error(`The output property of the ketivQere at index ${index} must be a string or function`);
+      }
+
+      if (ignoreTaamim !== undefined && typeof ignoreTaamim !== "boolean") {
+        throw new Error(`The ignoreTaamim property of the ketivQere at index ${index} must be a boolean`);
+      }
+
+      if (captureTaamim !== undefined && typeof captureTaamim !== "boolean") {
+        throw new Error(`The captureTaamim property of the ketivQere at index ${index} must be a boolean`);
+      }
+    }
+
+    return true;
+  }
+
   private validateOptions(options: SylOpts): SylOpts {
     const validOpts = [
       "allowNoNiqqud",
       "article",
       "holemHaser",
+      "ketivQeres",
       "longVowels",
       "qametsQatan",
       "shevaAfterMeteg",
@@ -279,6 +524,10 @@ export class Text {
       if (!validOpts.includes(k)) {
         throw new Error(`${k} is not a valid option`);
       }
+      if (k === "ketivQeres") {
+        this.validateKetivQeres(v);
+        continue;
+      }
       if (k === "holemHaser" && !["update", "preserve", "remove"].includes(String(v))) {
         throw new Error(`The value ${String(v)} is not a valid option for ${k}`);
       }
@@ -289,12 +538,28 @@ export class Text {
     return options;
   }
 
+  /**
+   * Removes every taam from a word
+   *
+   * @param text the word to strip
+   * @returns the word without taamim; a global copy of `taamim` is used so that all are removed, not just the first
+   */
+  private removeTaamim = (text: string) => {
+    return text.replace(new RegExp(taamim.source, "gu"), "");
+  };
+
   private setOptions(options: SylOpts): SylOpts {
     const validOpts = this.validateOptions(options);
     return {
       allowNoNiqqud: validOpts.allowNoNiqqud ?? false,
       article: validOpts.article ?? true,
       holemHaser: validOpts.holemHaser ?? "preserve",
+      ketivQeres:
+        validOpts.ketivQeres?.map((kq) => ({
+          ...kq,
+          ignoreTaamim: kq.ignoreTaamim ?? true,
+          captureTaamim: kq.captureTaamim ?? false
+        })) ?? [],
       longVowels: validOpts.longVowels ?? true,
       qametsQatan: validOpts.qametsQatan ?? true,
       shevaAfterMeteg: validOpts.shevaAfterMeteg ?? true,
@@ -305,6 +570,24 @@ export class Text {
     };
   }
 
+  /**
+   * Re-inserts captured taamim into the new text at the index each one had in the original word
+   *
+   * @param newText the output of a KetivQere
+   * @param taamimCapture the matches returned by `captureTaamim`
+   * @returns `newText` with the taamim restored
+   */
+  private setTaamim(newText: string, taamimCapture: Iterable<RegExpMatchArray>) {
+    return [...taamimCapture].reduce((text, match) => {
+      return text.slice(0, match.index) + match[1] + text.slice(match.index);
+    }, newText);
+  }
+
+  /** The global capture-group regex for taamim */
+  private static get taamimCaptureGroup() {
+    return taamimCaptureGroup;
+  }
+
   private get normalized(): string {
     return this.original.normalize("NFKD");
   }
@@ -420,7 +703,10 @@ export class Text {
   get words(): Word[] {
     const split = this.sanitized.split(splitGroup);
     const groups = split.filter((group) => group);
-    const words = groups.map((word) => new Word(word, this.options));
+    const words = groups.map((original) => {
+      const word = this.processKetivQeres(original);
+      return new Word(word, this.options, word !== original ? original : undefined);
+    });
     const [first, ...rest] = words;
     first.siblings = rest;
 
diff --git a/src/utils/regularExpressions.ts b/src/utils/regularExpressions.ts
index 5d24f57..901e66a 100644
--- a/src/utils/regularExpressions.ts
+++ b/src/utils/regularExpressions.ts
@@ -126,6 +126,15 @@ export const sheva = /\u{05B0}/u;
  */
 export const taamim = /[\u{0591}-\u{05AE}]/u;
 
+/**
+ * A regular expression containing all the Hebrew characters of the category ACCENT in a global capture group
+ *
+ * ```js
+ * /([\u{0591}-\u{05AE}])/gu;
+ * ```
+ */
+export const taamimCaptureGroup = /([\u{0591}-\u{05AE}])/gu;
+
 /**
  * a regular expression containing all the Hebrew point characters (excluding sheva and rafe)
  *
diff --git a/src/word.ts b/src/word.ts
index c590fae..ed799c8 100644
--- a/src/word.ts
+++ b/src/word.ts
@@ -14,6 +14,7 @@ import { syllabify } from "./utils/syllabifier";
  */
 export class Word extends Node {
   #text: string;
+  #original: string;
   /**
    * The white space that appears before the word
    *
@@ -104,10 +105,11 @@ export class Word extends Node {
     return word.split(clusterSplitGroup).map((group) => new Cluster(group));
   };
 
-  constructor(text: string, sylOpts: SylOpts) {
+  constructor(text: string, sylOpts: SylOpts, original?: string) {
     super();
     this.value = this;
     this.#text = text;
+    this.#original = original ?? text;
     const startMatch = text.match(/^\s*/g);
     const endMatch = text.match(/\s*$/g);
     this.whiteSpaceBefore = startMatch ? startMatch[0] : null;
@@ -337,6 +339,21 @@ export class Word extends Node {
     return this.text.includes("\u05BE");
   }
 
+  /**
+   * The original string passed
+   *
+   * @returns the original string passed
+   *
+   * @description
+   * The original string passed to the constructor that has not been checked against any KetivQeres.
+   *
+   * @remarks
+   * The original string passed to the constructor still undergoes the normalization and sequence process, just not checked against any KetivQeres.
+   */
+  get original(): string {
+    return this.#original;
+  }
+
   /**
    * Gets all the {@link Syllable | Syllables} in the Word
    *
diff --git a/test/options.test.ts b/test/options.test.ts
index e6868d3..4848b14 100644
--- a/test/options.test.ts
+++ b/test/options.test.ts
@@ -27,6 +27,32 @@ describe("validate options", () => {
       expect(() => new Text("וּלְמַזֵּר", { holemHaser: key })).not.toThrow();
     });
   });
+
+  describe("validate ketivQeres", () => {
+    test("error when passed incorrect input", () => {
+      // @ts-ignore
+      expect(() => new Text("וּלְמַזֵּר", { ketivQeres: [{ input: false, output: "bar" }] })).toThrow();
+    });
+
+    test("error when passed incorrect output", () => {
+      // @ts-ignore
+      expect(() => new Text("וּלְמַזֵּר", { ketivQeres: [{ input: "foo", output: false }] })).toThrow();
+    });
+
+    test("error when passed incorrect ignoreTaamim", () => {
+      expect(
+        // @ts-ignore
+        () => new Text("וּלְמַזֵּר", { ketivQeres: [{ input: "foo", output: "bar", ignoreTaamim: "bob" }] })
+      ).toThrow();
+    });
+
+    test("error when passed incorrect captureTaamim", () => {
+      expect(
+        // @ts-ignore
+        () => new Text("וּלְמַזֵּר", { ketivQeres: [{ input: "foo", output: "bar", captureTaamim: "bob" }] })
+      ).toThrow();
+    });
+  });
 });
 
 describe.each`
@@ -80,6 +106,37 @@ describe.each`
   });
 });
 
+describe.each`
+  description | input | options | original | output
+  ${"3fs qere perpetuum (default)"} | ${"הִ֑וא"} | ${{ input: "הִוא", output: "הִיא" }} | ${"הִ֑וא"} | ${"הִיא"}
+  ${"3fs qere perpetuum (ignoreTaamim false, no match)"} | ${"הִ֑וא"} | ${{ input: "הִוא", output: "הִיא", ignoreTaamim: false }} | ${"הִ֑וא"} | ${"הִ֑וא"}
+  ${"3fs qere perpetuum (ignoreTaamim false, match)"} | ${"הִ֑וא"} | ${{ input: "הִ֑וא", output: "הִיא", ignoreTaamim: false }} | ${"הִ֑וא"} | ${"הִיא"}
+  ${"3fs qere perpetuum (captureTaamim true)"} | ${"הִ֑וא"} | ${{ input: "הִוא", output: "הִיא", captureTaamim: true }} | ${"הִ֑וא"} | ${"הִ֑יא"}
+  ${"3fs qere perpetuum (captureTaamim true, no taamim)"} | ${"הִוא"} | ${{ input: "הִוא", output: "הִיא", captureTaamim: true }} | ${"הִוא"} | ${"הִיא"}
+  ${"3fs qere perpetuum (captureTaamim true, ignoreTaamim false, no match)"} | ${"הִוא"} | ${{ input: "הִ֑וא", output: "הִיא", captureTaamim: true, ignoreTaamim: false }} | ${"הִוא"} | ${"הִוא"}
+  ${"3fs qere perpetuum (captureTaamim true, ignoreTaamim false, match)"} | ${"הִ֑וא"} | ${{ input: "הִ֑וא", output: "הִיא", captureTaamim: true, ignoreTaamim: false }} | ${"הִ֑וא"} | ${"הִ֑יא"}
+  ${"quiesced alef using input as regex and output as callback"} | ${"וַיָּבִיאּוּ"} | ${{ input: /אּ/, output: (word: string, input: RegExp) => word.replace(input, "א") }} | ${"וַיָּבִיאּוּ"} | ${"וַיָּבִיאוּ"}
+`("ketivQeres", ({ description, input, options, original, output }) => {
+  test(description, () => {
+    const text = new Text(input, { ketivQeres: [options] });
+    const word = text.words[0];
+    expect(word.original).toEqual(original);
+    expect(word.text).toEqual(output);
+  });
+});
+
+describe("ketivQeres with multiple entries", () => {
+  test("falls through to a later entry when an earlier one does not match", () => {
+    const text = new Text("הִ֑וא", {
+      ketivQeres: [
+        { input: "foo", output: "bar" },
+        { input: "הִוא", output: "הִיא" }
+      ]
+    });
+    expect(text.words[0].text).toEqual("הִיא");
+  });
+});
+
 describe.each`
   description | word | syllables | isClosedArr | longVowelsOpt | qametsQatanOpt
   ${"regular qamets"} | ${"יָדְךָ"} | ${["יָ", "דְ", "ךָ"]} | ${[false, false, false]} | ${true} | ${true}