Skip to content

Commit

Permalink
Add support for Hindi sentence boundary character (।)
Browse files Browse the repository at this point in the history
- Update tokenizer to recognize Devanagari danda (।) as a sentence boundary
- Modify word splitting logic to handle Hindi text
- Add test cases for Hindi sentences
- Update README to mention Hindi sentence boundary support
  • Loading branch information
Jurriaan Roelofs committed Aug 31, 2024
1 parent 6eb7254 commit e9322d9
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 8 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)

Split text into sentences with a `vanilla` rule-based approach (i.e., working ~95% of the time).

* Split a text based on period, question- and exclamation marks.
* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।).
* Skips (most) abbreviations (Mr., Mrs., PhD.)
* Skips numbers/currency
* Skips urls, websites, email addresses, phone nr.
* Counts ellipsis and ?! as single punctuation

* Supports Hindi sentence boundaries

### Demo

[http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)
Expand Down
11 changes: 7 additions & 4 deletions dist/sbd.js
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {

if ((i = word.indexOf(".")) > -1 ||
(i = word.indexOf("!")) > -1 ||
(i = word.indexOf("?")) > -1)
(i = word.indexOf("?")) > -1 ||
(i = word.indexOf("।")) > -1)
{
var c = word.charAt(i + 1);

// Check if the next word starts with a letter
if (c.match(/[a-zA-Z].*/)) {
if (c.match(/[a-zA-Z\u0900-\u097F].*/)) {
return [word.slice(0, i), word.slice(i+1)];
}
}
Expand All @@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
exports.isBoundaryChar = function(word) {
return word === "." ||
word === "!" ||
word === "?";
word === "?" ||
word === "।";
};

},{}],2:[function(require,module,exports){
Expand Down Expand Up @@ -224,6 +226,7 @@ var newline_placeholder_t = newline_placeholder.trim();
// Matches a single non-whitespace character anywhere in the input.
var whiteSpaceCheck = new RegExp("\\S", "");
// Matches a run of one or more newlines, or a run of four or more
// characters drawn from the set - # = _ + *
var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
// Matches either a maximal run of non-whitespace characters or a single newline.
var splitIntoWords = new RegExp("\\S+|\\n", "g");
// Same as splitIntoWords with an extra alternative for the Devanagari
// danda "।" (U+0964).
// NOTE(review): "।" is itself a non-whitespace character, so the leading
// `\S+` alternative always consumes it together with the word it follows;
// the trailing `|।` alternative is never reached and this pattern yields the
// same tokens as splitIntoWords. If the danda is meant to come out as its own
// token, a pattern such as "[^\\s।]+|\\n|।" would be needed -- TODO confirm
// against how the tokenizer loop consumes the resulting words.
var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g");


// Split the entry into sentences.
Expand Down Expand Up @@ -295,7 +298,7 @@ exports.sentences = function(text, user_options) {
}
else {
// - see http://blog.tompawlak.org/split-string-into-tokens-javascript
words = text.trim().match(splitIntoWords);
words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords);
}


Expand Down
4 changes: 2 additions & 2 deletions lib/tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ var newline_placeholder_t = newline_placeholder.trim();
// Matches a single non-whitespace character anywhere in the input.
var whiteSpaceCheck = new RegExp("\\S", "");
// Matches a run of one or more newlines, or a run of four or more
// characters drawn from the set - # = _ + *
var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
// Matches either a maximal run of non-whitespace characters or a single newline.
var splitIntoWords = new RegExp("\\S+|\\n", "g");

// Same as splitIntoWords with an extra alternative for the Devanagari
// danda "।" (U+0964).
// NOTE(review): "।" is itself a non-whitespace character, so the leading
// `\S+` alternative always consumes it together with the word it follows;
// the trailing `|।` alternative is never reached and this pattern yields the
// same tokens as splitIntoWords. If the danda is meant to come out as its own
// token, a pattern such as "[^\\s।]+|\\n|।" would be needed -- TODO confirm
// against how the tokenizer loop consumes the resulting words.
var splitIntoWordsHindi = new RegExp("\\S+|\\n|।", "g");

// Split the entry into sentences.
exports.sentences = function(text, user_options) {
Expand Down Expand Up @@ -83,7 +83,7 @@ exports.sentences = function(text, user_options) {
}
else {
// - see http://blog.tompawlak.org/split-string-into-tokens-javascript
words = text.trim().match(splitIntoWords);
words = text.trim().match(text.includes('।') ? splitIntoWordsHindi : splitIntoWords);
}


Expand Down
18 changes: 18 additions & 0 deletions test/multiple_sentences.js
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,22 @@ describe('Multiple sentences', function () {
assert.equal(sentences.length, 2);
});
});

// Two Hindi sentences, each terminated by the Devanagari danda (।),
// are expected to be tokenized into exactly two sentences.
describe('Hindi sentence boundary character', function () {
  var hindiText = "यह पहला वाक्य है। यह दूसरा वाक्य है।";
  var result = tokenizer.sentences(hindiText);

  it('should get two sentences', function () {
    var count = result.length;
    assert.equal(count, 2);
  });
});

// Three consecutive danda-terminated Hindi sentences are expected to be
// tokenized into exactly three sentences.
describe('Multiple Hindi sentences', function () {
  var hindiText = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।";
  var result = tokenizer.sentences(hindiText);

  it('should get three sentences', function () {
    var count = result.length;
    assert.equal(count, 3);
  });
});
});

0 comments on commit e9322d9

Please sign in to comment.