From eec7d853aaa7446fc64d670f62fa6d6d2d738a19 Mon Sep 17 00:00:00 2001 From: Jurriaan Roelofs Date: Sat, 31 Aug 2024 12:40:49 +0200 Subject: [PATCH] =?UTF-8?q?Add=20support=20for=20Hindi=20sentence=20bounda?= =?UTF-8?q?ry=20character=20(=E0=A5=A4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update tokenizer to recognize Devanagari danda (।) as a sentence boundary - Modify word splitting logic to handle Hindi text - Add test cases for Hindi sentences - Update README to mention Hindi sentence boundary support --- README.md | 5 +++-- dist/sbd.js | 11 +++++++---- lib/Match.js | 3 ++- lib/tokenizer.js | 1 - test/multiple_sentences.js | 18 ++++++++++++++++++ 5 files changed, 30 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b1f432d..4751c60 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD) Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time). -* Split a text based on period, question- and exclamation marks. +* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।). * Skips (most) abbreviations (Mr., Mrs., PhD.) * Skips numbers/currency * Skips urls, websites, email addresses, phone nr. * Counts ellipsis and ?! as single punctuation - + * Supports Hindi sentence boundaries + ### Demo [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/) diff --git a/dist/sbd.js b/dist/sbd.js index 65698f4..c5b7497 100644 --- a/dist/sbd.js +++ b/dist/sbd.js @@ -160,12 +160,13 @@ exports.isConcatenated = function(word) { if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || - (i = word.indexOf("?")) > -1) + (i = word.indexOf("?")) > -1 || + (i = word.indexOf("।")) > -1) { var c = word.charAt(i + 1); // Check if the next word starts with a letter - if (c.match(/[a-zA-Z].*/)) { + if (c.match(/[a-zA-Z\u0900-\u097F].*/)) { return [word.slice(0, i), word.slice(i+1)]; } } @@ -176,7 +177,8 @@ exports.isConcatenated = function(word) { exports.isBoundaryChar = function(word) { return word === "." || word === "!" || - word === "?"; + word === "?" || + word === "।"; }; },{}],2:[function(require,module,exports){ @@ -224,6 +226,7 @@ var newline_placeholder_t = newline_placeholder.trim(); var whiteSpaceCheck = new RegExp("\\S", ""); var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g"); var splitIntoWords = new RegExp("\\S+|\\n", "g"); +var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g"); // Split the entry into sentences. @@ -295,7 +298,7 @@ exports.sentences = function(text, user_options) { } else { // - see http://blog.tompawlak.org/split-string-into-tokens-javascript - words = text.trim().match(splitIntoWords); + words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords); } diff --git a/lib/Match.js b/lib/Match.js index 0172b95..79b688f 100644 --- a/lib/Match.js +++ b/lib/Match.js @@ -175,5 +175,6 @@ exports.isConcatenated = function(word) { exports.isBoundaryChar = function(word) { return word === "." || word === "!" || - word === "?"; + word === "?" || + word === "।"; }; diff --git a/lib/tokenizer.js b/lib/tokenizer.js index 9f45721..b9344cc 100644 --- a/lib/tokenizer.js +++ b/lib/tokenizer.js @@ -13,7 +13,6 @@ var whiteSpaceCheck = new RegExp("\\S", ""); var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g"); var splitIntoWords = new RegExp("\\S+|\\n", "g"); - // Split the entry into sentences. exports.sentences = function(text, user_options) { if (!text || typeof text !== "string" || !text.length) { diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js index 24b49ed..49a958d 100644 --- a/test/multiple_sentences.js +++ b/test/multiple_sentences.js @@ -162,4 +162,22 @@ describe('Multiple sentences', function () { assert.equal(sentences.length, 2); }); }); + + describe('Hindi sentence boundary character', function () { + var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है।"; + var sentences = tokenizer.sentences(entry); + + it('should get two sentences', function () { + assert.equal(sentences.length, 2); + }); + }); + + describe('Multiple Hindi sentences', function () { + var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।"; + var sentences = tokenizer.sentences(entry); + + it('should get three sentences', function () { + assert.equal(sentences.length, 3); + }); + }); }); \ No newline at end of file