Skip to content

Commit

Permalink
Add ketiv qere feature
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesLoder committed Sep 10, 2024
1 parent 2e67a8e commit 92165da
Show file tree
Hide file tree
Showing 5 changed files with 325 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ import { Cluster } from "./cluster";
import { Node } from "./node";
import type { SyllableParams } from "./syllable";
import { Syllable } from "./syllable";
import type { SylOpts } from "./text";
import type { KetivQere, SylOpts } from "./text";
import { Text } from "./text";
import { Word } from "./word";

export { Char, Cluster, Node, Syllable, Text, Word };
export type { SyllableParams, SylOpts };
export type { KetivQere, SyllableParams, SylOpts };
256 changes: 251 additions & 5 deletions src/text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,42 @@ import { Cluster } from "./cluster";
import { Syllable } from "./syllable";
import { holemWaw } from "./utils/holemWaw";
import { convertsQametsQatan } from "./utils/qametsQatan";
import { splitGroup } from "./utils/regularExpressions";
import { splitGroup, taamim, taamimCaptureGroup } from "./utils/regularExpressions";
import { sequence } from "./utils/sequence";
import { Word } from "./word";

/**
* Describes a single ketiv qere substitution: which word to look for and what to replace it with
*/
export interface KetivQere {
/**
* The word or regex to match on
*
* @remarks
* A string only matches when it is exactly equal to the whole word; a RegExp matches when it is found anywhere in the word
*/
input: string | RegExp;
/**
* The output of the ketiv qere
*
* @remarks
* When using a callback, the parameter `text` is the whole text of the word (after taamim removal when `ignoreTaamim` is enabled), and `input` is the input of the ketiv qere
*/
output:
| string
/**
* @param text the whole text of the word
* @param input the input of the ketiv qere
*/
| ((text: string, input: KetivQere["input"]) => string);
/**
* Whether to ignore taamim in the target string
*
* @defaultValue true
*/
ignoreTaamim?: boolean;
/**
* Whether to capture taamim from the word being processed and add them to the output
*
* @defaultValue false
*/
captureTaamim?: boolean;
}

/**
* Options for determining syllabification that may differ according to reading traditions
*/
Expand Down Expand Up @@ -58,7 +90,8 @@ export interface SylOpts {
*
* @defaultValue `"preserve"`
*
* @example update
* @example
* update
* ```ts
* const holemHaser = /\u{05BA}/u;
* const str = "עָוֹן" // vav + holem
Expand All @@ -68,7 +101,8 @@ export interface SylOpts {
*
* ```
*
* @example preserve
* @example
* preserve
* ```ts
* const holemHaser = /\u{05BA}/u;
* const str = "עָוֹן" // vav + holem
Expand All @@ -77,7 +111,8 @@ export interface SylOpts {
* holemHaser.test(newStr); // false
* ```
*
* @example remove
* @example
* remove
* ```ts
* const holemHaser = /\u{05BA}/u;
* const str = "עָוֺן" // vav + holem haser
Expand All @@ -87,6 +122,78 @@ export interface SylOpts {
* ```
*/
holemHaser?: "update" | "preserve" | "remove";
/**
* An array of KetivQere objects
*
* @defaultValue `undefined`
*
* @example
* default
* ```ts
* const text = new Text("הִ֑וא", {
* ketivQeres: [
* {
* input: "הִוא",
* output: "הִיא"
* }
* ]
* });
* console.log(text.words[0].text);
* // הִיא
* ```
*
* @example
* `captureTaamim` set to `true`
* ```ts
* const text = new Text("הִ֑וא", {
* ketivQeres: [
* {
* input: "הִוא",
* output: "הִיא",
* captureTaamim: true
* }
* ]
* });
* console.log(text.words[0].text);
* // הִ֑יא
* ```
*
* @example
* `ignoreTaamim` set to `false`
* ```ts
* const text = new Text("הִ֑וא", {
* ketivQeres: [
* {
* input: "הִ֯וא",
* output: "הִיא",
* ignoreTaamim: false
* }
* ]
* });
* console.log(text.words[0].text);
* // הִ֯וא
* // does not match because the input taam is not the same as the Text taam
* ```
*
* @example
* `input` as a regular expression, and `output` as a callback
* ```ts
* const text = new Text("וַיָּבִיאּוּ", {
* ketivQeres: [
* {
* input: /אּ/,
* output: (word, input) => word.replace(input, "א")
* }
* ]
* });
* console.log(text.words[0].text);
* // וַיָּבִיאוּ
* ```
*
* @remarks
* KetivQere objects allow for flexible handling of words, mimicking how ketiv/qeres are used in biblical manuscripts
*/
ketivQeres?: KetivQere[];
/**
* Determines whether to regard a sheva after a long vowel (excluding waw-shureq, see {@link wawShureq}) as a _sheva na'_, unless preceded by a meteg (see {@link shevaAfterMeteg}).
*
Expand Down Expand Up @@ -240,6 +347,23 @@ export interface SylOpts {
export class Text {
#original: string;
private options: SylOpts;
/**
* Cache for {@link SylOpts.ketivQeres}, keyed by the exact text of a word and holding the text that replaced it
*
* @privateRemarks
* This cache can be improved. Currently, it can only check for exact matches.
* So for example, if you have ketivQere options like this:
* ```js
* new Text("לֹא־נִפְלֵ֥את הִוא֙ מִמְּךָ֔ וְלֹ֥א רְחֹקָ֖ה הִֽוא׃", {
* ketivQeres: [
* { input: "הִוא", output: "הִיא" },
* ]
* })
* ```
*
* The cache will miss because `הִוא֙` and `הִֽוא׃` are not exact matches, even though `ignoreTaamim` is `true`.
*/
private ketivQereCache: { [k: string]: string } = {};

/**
* `Text` requires an input string,
Expand All @@ -254,6 +378,57 @@ export class Text {
this.#original = this.options.allowNoNiqqud ? text : this.validateInput(text);
}

/**
 * Tries a single ketiv qere against a word.
 *
 * A RegExp `input` matches when it is found anywhere in `text`;
 * a string `input` matches only when it is exactly equal to `text`.
 *
 * @param text the word to test
 * @param kq the ketiv qere to try
 * @returns the configured output (or the result of the output callback) on a match, otherwise `null`
 */
private applyKetivQere = (text: string, kq: KetivQere) => {
  const isMatch = kq.input instanceof RegExp ? text.match(kq.input) !== null : kq.input === text;

  if (!isMatch) {
    return null;
  }

  if (typeof kq.output === "string") {
    return kq.output;
  }

  // callbacks receive the tested text along with the original input
  return kq.output(text, kq.input);
};

/**
 * Finds every taam in a word.
 *
 * @param text the word to search
 * @returns an iterator over the matches of {@link Text.taamimCaptureGroup} in `text`
 */
private captureTaamim = (text: string) => text.matchAll(Text.taamimCaptureGroup);

/**
 * Applies the first matching ketiv qere from the options to a word.
 *
 * @param text the word as it appears in the text
 * @returns the replacement produced by the first ketiv qere that matches, or `text` unchanged when none match
 *
 * @remarks
 * Results are cached by the exact `text`, so a repeated word is only processed once.
 */
private processKetivQeres = (text: string) => {
  const cached = this.ketivQereCache[text];
  if (cached) {
    return cached;
  }

  const ketivQeres = this.options.ketivQeres;
  if (!ketivQeres?.length) {
    return text;
  }

  for (const ketivQere of ketivQeres) {
    const candidate = ketivQere.ignoreTaamim ? this.removeTaamim(text) : text;
    const appliedKetivQere = this.applyKetivQere(candidate, ketivQere);

    // No match (or an empty output): move on to the next ketiv qere.
    // Returning here would stop every entry after the first from ever being tried.
    if (!appliedKetivQere) {
      continue;
    }

    // taamim are captured from the original word, not from the stripped candidate
    const newText = ketivQere.captureTaamim
      ? this.setTaamim(appliedKetivQere, this.captureTaamim(text))
      : appliedKetivQere;

    this.ketivQereCache[text] = newText;

    return newText;
  }

  return text;
};

private validateInput(text: string): string {
const niqqud = /[\u{05B0}-\u{05BC}\u{05C7}]/u;
if (!niqqud.test(text)) {
Expand All @@ -262,11 +437,55 @@ export class Text {
return text;
}

/**
 * Checks the shape of every entry in the `ketivQeres` option.
 *
 * @param ketivQeres the value of {@link SylOpts.ketivQeres}
 * @returns `true` when the value is absent, empty, or every entry is well formed
 * @throws Error when an entry is missing `input`/`output` or a property has the wrong type
 */
private validateKetivQeres(ketivQeres: SylOpts["ketivQeres"]) {
  // undefined or an empty list needs no validation
  if (!ketivQeres?.length) {
    return true;
  }

  for (const [index, ketivQere] of ketivQeres.entries()) {
    const { input, output, ignoreTaamim, captureTaamim } = ketivQere;

    if (input === undefined) {
      throw new Error(`The ketivQere at index ${index} must have an input`);
    }

    if (!(input instanceof RegExp) && typeof input !== "string") {
      throw new Error(`The input property of the ketivQere at index ${index} must be a string or RegExp`);
    }

    if (output === undefined) {
      throw new Error(`The ketivQere at index ${index} must have an output`);
    }

    if (typeof output !== "string" && typeof output !== "function") {
      throw new Error(`The output property of the ketivQere at index ${index} must be a string or function`);
    }

    // Compare against undefined/null explicitly: a truthiness guard would let
    // falsy non-booleans such as `0` or `""` slip through validation.
    // `null` stays accepted, as it was before.
    if (ignoreTaamim !== undefined && ignoreTaamim !== null && typeof ignoreTaamim !== "boolean") {
      throw new Error(`The ignoreTaamim property of the ketivQere at index ${index} must be a boolean`);
    }

    if (captureTaamim !== undefined && captureTaamim !== null && typeof captureTaamim !== "boolean") {
      throw new Error(`The captureTaamim property of the ketivQere at index ${index} must be a boolean`);
    }
  }

  return true;
}

private validateOptions(options: SylOpts): SylOpts {
const validOpts = [
"allowNoNiqqud",
"article",
"holemHaser",
"ketivQeres",
"longVowels",
"qametsQatan",
"shevaAfterMeteg",
Expand All @@ -279,6 +498,10 @@ export class Text {
if (!validOpts.includes(k)) {
throw new Error(`${k} is not a valid option`);
}
if (k === "ketivQeres") {
this.validateKetivQeres(v);

Check failure on line 502 in src/text.ts

View workflow job for this annotation

GitHub Actions / test

Unsafe argument of type `any` assigned to a parameter of type `KetivQere[] | undefined`
continue;
}
if (k === "holemHaser" && !["update", "preserve", "remove"].includes(String(v))) {
throw new Error(`The value ${String(v)} is not a valid option for ${k}`);
}
Expand All @@ -289,12 +512,22 @@ export class Text {
return options;
}

/**
 * Strips every taam from a word.
 *
 * @param text the word to strip
 * @returns `text` without any characters matched by `taamim`
 *
 * @remarks
 * `taamim` has no global flag, so `text.replace(taamim, "")` would remove only the first accent
 * and leave words carrying two taamim partially stripped. Splitting on the pattern cuts at every
 * occurrence regardless of the flag.
 */
private removeTaamim = (text: string) => {
  return text.split(taamim).join("");
};

private setOptions(options: SylOpts): SylOpts {
const validOpts = this.validateOptions(options);
return {
allowNoNiqqud: validOpts.allowNoNiqqud ?? false,
article: validOpts.article ?? true,
holemHaser: validOpts.holemHaser ?? "preserve",
ketivQeres:
validOpts.ketivQeres?.map((kq) => ({
...kq,
ignoreTaamim: kq.ignoreTaamim ?? true,
captureTaamim: kq.captureTaamim ?? false

Check failure on line 529 in src/text.ts

View workflow job for this annotation

GitHub Actions / test

Expected object keys to be in ascending order. 'captureTaamim' should be before 'ignoreTaamim'
})) ?? [],
longVowels: validOpts.longVowels ?? true,
qametsQatan: validOpts.qametsQatan ?? true,
shevaAfterMeteg: validOpts.shevaAfterMeteg ?? true,
Expand All @@ -305,6 +538,16 @@ export class Text {
};
}

/**
 * Re-inserts captured taamim into a replacement word.
 *
 * @param newText the text to insert the taamim into
 * @param taamimCapture the matches produced by `captureTaamim`
 * @returns `newText` with each captured taam inserted at the index where it was matched
 *
 * @remarks
 * Matches are applied in the order they are yielded, each against the text as already extended
 * by the previous insertions.
 */
private setTaamim(newText: string, taamimCapture: ReturnType<Text["captureTaamim"]>) {
  let result = newText;

  // `match` rather than `taamim`, to avoid shadowing the imported `taamim` regular expression
  for (const match of taamimCapture) {
    result = result.slice(0, match.index) + match[1] + result.slice(match.index);
  }

  return result;
}

/**
* Exposes the imported `taamimCaptureGroup` regular expression as a static member of the class
*/
private static get taamimCaptureGroup() {
return taamimCaptureGroup;
}

/**
* The value of `this.original` converted to Unicode normalization form NFKD
*/
private get normalized(): string {
return this.original.normalize("NFKD");
}
Expand Down Expand Up @@ -420,7 +663,10 @@ export class Text {
get words(): Word[] {
const split = this.sanitized.split(splitGroup);
const groups = split.filter((group) => group);
const words = groups.map((word) => new Word(word, this.options));
const words = groups.map((original) => {
const word = this.processKetivQeres(original);
return new Word(word, this.options, word !== original ? original : undefined);
});
const [first, ...rest] = words;
first.siblings = rest;

Expand Down
9 changes: 9 additions & 0 deletions src/utils/regularExpressions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ export const sheva = /\u{05B0}/u;
*/
export const taamim = /[\u{0591}-\u{05AE}]/u;

/**
* A regular expression containing all the Hebrew characters of the category ACCENT in a global capture group
*
* ```js
* /([\u{0591}-\u{05AE}])/gu;
* ```
*
* @remarks
* Each match captures a single accent character in group 1.
* Because the expression is global, `test` and `exec` advance its `lastIndex` between calls.
*/
export const taamimCaptureGroup = /([\u{0591}-\u{05AE}])/gu;

/**
* a regular expression containing all the Hebrew point characters (excluding sheva and rafe)
*
Expand Down
Loading

0 comments on commit 92165da

Please sign in to comment.