From 802964244f1ebae1962aff3ec4c626f5dfa12450 Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Mon, 16 Oct 2017 11:42:49 +1100 Subject: [PATCH 1/2] add sample size option --- index.js | 44 ++++++++++++++++++++++++++++++++++++++------ test/chardet.js | 12 ++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/index.js b/index.js index 8a5153d..e5bcedd 100644 --- a/index.js +++ b/index.js @@ -73,13 +73,45 @@ module.exports.detect = function(buffer) { return match ? match.name : null; }; -module.exports.detectFile = function(filepath, fn) { - fs.readFile(filepath, function(err, res) { - if (err) return fn(err, null); - fn(null, self.detect(res)); - }); +module.exports.detectFile = function(filepath, opts, cb) { + if (typeof opts === 'function') { + cb = opts; + opts = undefined; + } + + var fd; + + var handler = function(err, buffer) { + if (fd) { + fs.closeSync(fd); + } + + if (err) return cb(err, null); + cb(null, self.detect(buffer)); + }; + + if (opts && opts.sampleSize) { + fd = fs.openSync(filepath, 'r'), + sample = new Buffer(opts.sampleSize); + + fs.read(fd, sample, 0, opts.sampleSize, null, function(err) { + handler(err, sample); + }); + return; + } + + fs.readFile(filepath, handler); }; -module.exports.detectFileSync = function(filepath) { +module.exports.detectFileSync = function(filepath, opts) { + if (opts && opts.sampleSize) { + var fd = fs.openSync(filepath, 'r'), + sample = new Buffer(opts.sampleSize); + + fs.readSync(fd, sample, 0, opts.sampleSize); + fs.closeSync(fd); + return self.detect(sample); + } + return self.detect(fs.readFileSync(filepath)); }; diff --git a/test/chardet.js b/test/chardet.js index b73c15b..4b252b0 100644 --- a/test/chardet.js +++ b/test/chardet.js @@ -20,11 +20,23 @@ describe('chardet', function() { done(); }); }); + + it('should detect encoding with smaller sample size', function(done) { + chardet.detectFile(path, { sampleSize: 32 }, function(err, res) { + assert.equal(err, null); + assert.equal(res, 
'UTF-8'); + done(); + }); + }); }); describe('#detectFileSync', function() { it('should detect encoding', function() { assert.equal(chardet.detectFileSync(path), 'UTF-8'); }); + + it('should detect encoding with smaller sample size', function() { + assert.equal(chardet.detectFileSync(path, { sampleSize: 32 }), 'UTF-8'); + }); }); }); From 17f788b73fc500d8fb7fcb0ab204cae92b4b9382 Mon Sep 17 00:00:00 2001 From: Dmitry Shirokov Date: Mon, 16 Oct 2017 11:46:01 +1100 Subject: [PATCH 2/2] Update readme --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 496f518..7ca84e3 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,15 @@ chardet.detectFile('/path/to/file', function(err, encoding) {}); chardet.detectFileSync('/path/to/file'); ``` +## Working with large data sets + +Sometimes, when the data set is huge and you want to optimize performance (at the cost of some accuracy), +you can sample only the first N bytes of the buffer: + +```javascript +chardet.detectFile('/path/to/file', { sampleSize: 32 }, function(err, encoding) {}); +``` + ## Supported Encodings: * UTF-8