Add support for Hindi sentence boundary character (।)

- Update tokenizer to recognize Devanagari danda (।) as a sentence boundary
- Modify word splitting logic to handle Hindi text
- Add test cases for Hindi sentences
- Update README to mention Hindi sentence boundary support
Tessmore · Aug 31, 2024 · e9322d9
1 parent 6eb7254
commit e9322d9
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)
 
 Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time).
 
-* Split a text based on period, question- and exclamation marks.
+* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।).
     * Skips (most) abbreviations (Mr., Mrs., PhD.)
     * Skips numbers/currency
     * Skips urls, websites, email addresses, phone nr.
     * Counts ellipsis and ?! as single punctuation
-
+    * Supports Hindi sentence boundaries
+
 ### Demo
 
 [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)

diff --git a/dist/sbd.js b/dist/sbd.js
@@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {
 
     if ((i = word.indexOf(".")) > -1 ||
         (i = word.indexOf("!")) > -1 ||
-        (i = word.indexOf("?")) > -1)
+        (i = word.indexOf("?")) > -1 ||
+        (i = word.indexOf("।")) > -1)
     {
         var c = word.charAt(i + 1);
 
         // Check if the next word starts with a letter
-        if (c.match(/[a-zA-Z].*/)) {
+        if (c.match(/[a-zA-Z\u0900-\u097F].*/)) {
             return [word.slice(0, i), word.slice(i+1)];
         }
     }
@@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "।";
 };
 
 },{}],2:[function(require,module,exports){
@@ -224,6 +226,7 @@ var newline_placeholder_t = newline_placeholder.trim();
 var whiteSpaceCheck = new RegExp("\\S", "");
 var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
 var splitIntoWords = new RegExp("\\S+|\\n", "g");
+var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g");
 
 
 // Split the entry into sentences.
@@ -295,7 +298,7 @@ exports.sentences = function(text, user_options) {
     }
     else {
         // - see http://blog.tompawlak.org/split-string-into-tokens-javascript
-        words = text.trim().match(splitIntoWords);
+        words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords);
     }
 
 

diff --git a/lib/tokenizer.js b/lib/tokenizer.js
@@ -12,7 +12,7 @@ var newline_placeholder_t = newline_placeholder.trim();
 var whiteSpaceCheck = new RegExp("\\S", "");
 var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
 var splitIntoWords = new RegExp("\\S+|\\n", "g");
-
+var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g");
 
 // Split the entry into sentences.
 exports.sentences = function(text, user_options) {
@@ -83,7 +83,7 @@ exports.sentences = function(text, user_options) {
     }
     else {
         // - see http://blog.tompawlak.org/split-string-into-tokens-javascript
-        words = text.trim().match(splitIntoWords);
+        words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords);
     }
 
 

diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js
@@ -162,4 +162,22 @@ describe('Multiple sentences', function () {
             assert.equal(sentences.length, 2);
         });
     });
+
+    describe('Hindi sentence boundary character', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get two sentences', function () {
+            assert.equal(sentences.length, 2);
+        });
+    });
+
+    describe('Multiple Hindi sentences', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get three sentences', function () {
+            assert.equal(sentences.length, 3);
+        });
+    });
 });