diff --git a/README.md b/README.md index b1f432d..4751c60 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD) Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time). -* Split a text based on period, question- and exclamation marks. +* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।). * Skips (most) abbreviations (Mr., Mrs., PhD.) * Skips numbers/currency * Skips urls, websites, email addresses, phone nr. * Counts ellipsis and ?! as single punctuation - + * Supports Hindi sentence boundaries + ### Demo [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/) diff --git a/dist/sbd.js b/dist/sbd.js index 65698f4..c5b7497 100644 --- a/dist/sbd.js +++ b/dist/sbd.js @@ -160,12 +160,13 @@ exports.isConcatenated = function(word) { if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || - (i = word.indexOf("?")) > -1) + (i = word.indexOf("?")) > -1 || + (i = word.indexOf("।")) > -1) { var c = word.charAt(i + 1); // Check if the next word starts with a letter - if (c.match(/[a-zA-Z].*/)) { + if (c.match(/[a-zA-Z\u0900-\u097F].*/)) { return [word.slice(0, i), word.slice(i+1)]; } } @@ -176,7 +177,8 @@ exports.isConcatenated = function(word) { exports.isBoundaryChar = function(word) { return word === "." || word === "!" || - word === "?"; + word === "?" || + word === "।"; }; },{}],2:[function(require,module,exports){ @@ -224,6 +226,7 @@ var newline_placeholder_t = newline_placeholder.trim(); var whiteSpaceCheck = new RegExp("\\S", ""); var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g"); var splitIntoWords = new RegExp("\\S+|\\n", "g"); +var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g"); // Split the entry into sentences. @@ -295,7 +298,7 @@ exports.sentences = function(text, user_options) { } else { // - see http://blog.tompawlak.org/split-string-into-tokens-javascript - words = text.trim().match(splitIntoWords); + words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords); } diff --git a/lib/tokenizer.js b/lib/tokenizer.js index 9f45721..080973f 100644 --- a/lib/tokenizer.js +++ b/lib/tokenizer.js @@ -12,7 +12,7 @@ var newline_placeholder_t = newline_placeholder.trim(); var whiteSpaceCheck = new RegExp("\\S", ""); var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g"); var splitIntoWords = new RegExp("\\S+|\\n", "g"); - +var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g"); // Split the entry into sentences. exports.sentences = function(text, user_options) { @@ -83,7 +83,7 @@ exports.sentences = function(text, user_options) { } else { // - see http://blog.tompawlak.org/split-string-into-tokens-javascript - words = text.trim().match(splitIntoWords); + words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords); } diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js index 24b49ed..49a958d 100644 --- a/test/multiple_sentences.js +++ b/test/multiple_sentences.js @@ -162,4 +162,22 @@ describe('Multiple sentences', function () { assert.equal(sentences.length, 2); }); }); + + describe('Hindi sentence boundary character', function () { + var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है।"; + var sentences = tokenizer.sentences(entry); + + it('should get two sentences', function () { + assert.equal(sentences.length, 2); + }); + }); + + describe('Multiple Hindi sentences', function () { + var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।"; + var sentences = tokenizer.sentences(entry); + + it('should get three sentences', function () { + assert.equal(sentences.length, 3); + }); + }); }); \ No newline at end of file