From 6c97852799d409f7125010a1c4e5ba9d0295565f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Souli=C3=A9?= Date: Fri, 23 Aug 2019 14:17:18 +0200 Subject: [PATCH] Introduce splitRegex option. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue: Using /\b/ to split the text is limited to languages using only the 63 characters: ``` a b c d e f g h i j k l m n o p q r s t u v w x y z A B C D E F G H I J K L M N O P Q R S T U V W X Y Z 0 1 2 3 4 5 6 7 8 9 _ ``` cf. https://stackoverflow.com/a/2449892 For example, in French, the string `je suis français` will be split into `["je", " ", "suis", " ", "fran", "ç", "ais"]`, which prevents the bad-words cleaning from working because `français` is never seen as a single word. Therefore I added a `splitRegex` option which allows overriding the regex used to split. --- lib/badwords.js | 6 ++++-- test/options.js | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 test/options.js diff --git a/lib/badwords.js b/lib/badwords.js index 9091708..3990c41 100644 --- a/lib/badwords.js +++ b/lib/badwords.js @@ -12,11 +12,13 @@ class Filter { * @param {string} options.placeHolder - Character used to replace profane words. * @param {string} options.regex - Regular expression used to sanitize words before comparing them to blacklist. * @param {string} options.replaceRegex - Regular expression used to replace profane words with placeHolder. + * @param {string} options.splitRegex - Regular expression used to split a string into words. */ constructor(options = {}) { Object.assign(this, { list: options.emptyList && [] || Array.prototype.concat.apply(localList, [baseList, options.list || []]), exclude: options.exclude || [], + splitRegex: options.splitRegex || /\b/, placeHolder: options.placeHolder || '*', regex: options.regex || /[^a-zA-Z0-9|\$|\@]|\^/g, replaceRegex: options.replaceRegex || /\w/g @@ -51,9 +53,9 @@ class Filter { * @param {string} string - Sentence to filter. 
*/ clean(string) { - return string.split(/\b/).map((word) => { + return string.split(this.splitRegex).map((word) => { return this.isProfane(word) ? this.replaceWord(word) : word; - }).join(''); + }).join((string.match(this.splitRegex) || [''])[0]); /* match() is null when nothing matches (e.g. an empty string); fall back to '' instead of throwing */ } /** diff --git a/test/options.js b/test/options.js new file mode 100644 index 0000000..cada98f --- /dev/null +++ b/test/options.js @@ -0,0 +1,24 @@ +require('assert'); +var Filter = require('../lib/badwords.js'), +assert = require('better-assert'); + +describe('options', function() { + describe('split regex', function() { + + it('default value', function() { + filter = new Filter(); + filter.addWords('français'); + assert(filter.clean('fucking asshole') == '******* *******'); + assert(filter.clean('mot en français') == 'mot en français'); + }); + + it('override value', function() { + filter = new Filter({splitRegex: / /}); + filter.addWords('français'); + assert(filter.clean('fucking asshole') == '******* *******'); + assert(filter.clean('mot en français') == 'mot en *******'); + }); + + + }); +});