From eec7d853aaa7446fc64d670f62fa6d6d2d738a19 Mon Sep 17 00:00:00 2001
From: Jurriaan Roelofs <jur@dxpr.com>
Date: Sat, 31 Aug 2024 12:40:49 +0200
Subject: [PATCH] =?UTF-8?q?Add=20support=20for=20Hindi=20sentence=20bounda?=
 =?UTF-8?q?ry=20character=20(=E0=A5=A4)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

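- Recognize the Devanagari danda (।) as a sentence boundary character in Match.isBoundaryChar (lib/Match.js and dist/sbd.js)
- In dist/sbd.js, let isConcatenated split on the danda and accept a Devanagari character after the boundary, and use a Hindi word-splitting pattern when the text contains a danda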
- Add test cases for Hindi sentences
- Update README to mention Hindi sentence boundary support
---
 README.md                  |  5 +++--
 dist/sbd.js                | 11 +++++++----
 lib/Match.js               |  3 ++-
 lib/tokenizer.js           |  1 -
 test/multiple_sentences.js | 18 ++++++++++++++++++
 5 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index b1f432d..4751c60 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)
 
 Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time).
 
-* Split a text based on period, question- and exclamation marks.
+* Split a text based on periods, question marks, exclamation marks, and the Devanagari danda (।) that ends Hindi sentences.
     * Skips (most) abbreviations (Mr., Mrs., PhD.)
     * Skips numbers/currency
     * Skips urls, websites, email addresses, phone nr.
     * Counts ellipsis and ?! as single punctuation
- 
+    * Supports Hindi sentence boundaries
+
 ### Demo
 
 [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)
diff --git a/dist/sbd.js b/dist/sbd.js
index 65698f4..c5b7497 100644
--- a/dist/sbd.js
+++ b/dist/sbd.js
@@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {
 
     if ((i = word.indexOf(".")) > -1 ||
         (i = word.indexOf("!")) > -1 ||
-        (i = word.indexOf("?")) > -1)
+        (i = word.indexOf("?")) > -1 ||
+        (i = word.indexOf("।")) > -1)
     {
         var c = word.charAt(i + 1);
 
         // Check if the next word starts with a letter
-        if (c.match(/[a-zA-Z].*/)) {
+        if (c.match(/[a-zA-Z\u0900-\u097F].*/)) {
             return [word.slice(0, i), word.slice(i+1)];
         }
     }
@@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "।";
 };
 
 },{}],2:[function(require,module,exports){
@@ -224,6 +226,7 @@ var newline_placeholder_t = newline_placeholder.trim();
 var whiteSpaceCheck = new RegExp("\\S", "");
 var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
 var splitIntoWords = new RegExp("\\S+|\\n", "g");
+var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g");
 
 
 // Split the entry into sentences.
@@ -295,7 +298,7 @@ exports.sentences = function(text, user_options) {
     }
     else {
         // - see http://blog.tompawlak.org/split-string-into-tokens-javascript
-        words = text.trim().match(splitIntoWords);
+        words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords);
     }
 
 
diff --git a/lib/Match.js b/lib/Match.js
index 0172b95..79b688f 100644
--- a/lib/Match.js
+++ b/lib/Match.js
@@ -175,5 +175,6 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "।";
 };
diff --git a/lib/tokenizer.js b/lib/tokenizer.js
index 9f45721..b9344cc 100644
--- a/lib/tokenizer.js
+++ b/lib/tokenizer.js
@@ -13,7 +13,6 @@ var whiteSpaceCheck = new RegExp("\\S", "");
 var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
 var splitIntoWords = new RegExp("\\S+|\\n", "g");
 
-
 // Split the entry into sentences.
 exports.sentences = function(text, user_options) {
     if (!text || typeof text !== "string" || !text.length) {
diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js
index 24b49ed..49a958d 100644
--- a/test/multiple_sentences.js
+++ b/test/multiple_sentences.js
@@ -162,4 +162,22 @@ describe('Multiple sentences', function () {
             assert.equal(sentences.length, 2);
         });
     });
+
+    describe('Hindi sentence boundary character', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get two sentences', function () {
+            assert.equal(sentences.length, 2);
+        });
+    });
+
+    describe('Multiple Hindi sentences', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get three sentences', function () {
+            assert.equal(sentences.length, 3);
+        });
+    });
 });
\ No newline at end of file