Skip to content

Commit

Permalink
Merge pull request #6 from runk/optimize-file-read
Browse files Browse the repository at this point in the history
Optimize file read
  • Loading branch information
runk authored Oct 16, 2017
2 parents 15a2547 + 17f788b commit 320e389
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 6 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,15 @@ chardet.detectFile('/path/to/file', function(err, encoding) {});
chardet.detectFileSync('/path/to/file');
```

## Working with large data sets

Sometimes, when the data set is huge and you want to optimize performance (at the cost of some accuracy),
you can sample only the first N bytes of the buffer:

```javascript
chardet.detectFile('/path/to/file', { sampleSize: 32 }, function(err, encoding) {});
```

## Supported Encodings:

* UTF-8
Expand Down
44 changes: 38 additions & 6 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,45 @@ module.exports.detect = function(buffer) {
return match ? match.name : null;
};

module.exports.detectFile = function(filepath, fn) {
fs.readFile(filepath, function(err, res) {
if (err) return fn(err, null);
fn(null, self.detect(res));
});
/**
 * Detects the character encoding of a file asynchronously.
 *
 * @param {string} filepath Path of the file to inspect.
 * @param {Object|Function} [opts] Options object, or the callback when no
 *     options are given.
 * @param {number} [opts.sampleSize] When truthy, only the first `sampleSize`
 *     bytes of the file are read and passed to the detector.
 * @param {Function} cb Node-style callback `(err, encoding)`. On success
 *     `encoding` is whatever `self.detect` returns for the bytes read.
 */
module.exports.detectFile = function(filepath, opts, cb) {
  if (typeof opts === 'function') {
    cb = opts;
    opts = undefined;
  }

  // No sampling requested: read the whole file.
  if (!(opts && opts.sampleSize)) {
    fs.readFile(filepath, function(err, buffer) {
      if (err) return cb(err, null);
      cb(null, self.detect(buffer));
    });
    return;
  }

  var sampleSize = opts.sampleSize;

  // Allocate before opening so an invalid size cannot leak a descriptor.
  // Buffer.alloc zero-fills, unlike the deprecated `new Buffer(size)`.
  var sample = Buffer.alloc(sampleSize);

  // Open asynchronously so a missing/unreadable file is reported through the
  // callback instead of being thrown synchronously from an async API.
  fs.open(filepath, 'r', function(openErr, fd) {
    if (openErr) return cb(openErr, null);

    fs.read(fd, sample, 0, sampleSize, null, function(readErr, bytesRead) {
      // Always release the descriptor, whether or not the read succeeded.
      fs.close(fd, function(closeErr) {
        var err = readErr || closeErr;
        if (err) return cb(err, null);

        // The file may be shorter than sampleSize; only analyse bytes that
        // were actually read, not the unused tail of the buffer.
        cb(null, self.detect(sample.subarray(0, bytesRead)));
      });
    });
  });
};

module.exports.detectFileSync = function(filepath) {
/**
 * Detects the character encoding of a file synchronously.
 *
 * @param {string} filepath Path of the file to inspect.
 * @param {Object} [opts] Optional settings.
 * @param {number} [opts.sampleSize] When truthy, only the first `sampleSize`
 *     bytes of the file are read and passed to the detector.
 * @returns {*} Whatever `self.detect` returns for the bytes read.
 * @throws {Error} Any error raised by the underlying `fs` calls.
 */
module.exports.detectFileSync = function(filepath, opts) {
  if (opts && opts.sampleSize) {
    // Allocate before opening so an invalid size cannot leak a descriptor.
    // Buffer.alloc zero-fills, unlike the deprecated `new Buffer(size)`.
    var sample = Buffer.alloc(opts.sampleSize);
    var fd = fs.openSync(filepath, 'r');
    var bytesRead;

    try {
      bytesRead = fs.readSync(fd, sample, 0, opts.sampleSize, null);
    } finally {
      // Release the descriptor even when the read throws.
      fs.closeSync(fd);
    }

    // The file may be shorter than sampleSize; only analyse bytes that were
    // actually read, not the unused tail of the buffer.
    return self.detect(sample.subarray(0, bytesRead));
  }

  return self.detect(fs.readFileSync(filepath));
};
12 changes: 12 additions & 0 deletions test/chardet.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,23 @@ describe('chardet', function() {
done();
});
});

// Async detection limited to a 32-byte sample must still report UTF-8 for
// the file at `path`.
it('should detect encoding with smaller sample size', function(done) {
  var options = { sampleSize: 32 };

  var onDetected = function(err, res) {
    assert.equal(err, null);
    assert.equal(res, 'UTF-8');
    done();
  };

  chardet.detectFile(path, options, onDetected);
});
});

// Synchronous detection: once over the whole file, once over a 32-byte sample.
describe('#detectFileSync', function() {
  it('should detect encoding', function() {
    var encoding = chardet.detectFileSync(path);

    assert.equal(encoding, 'UTF-8');
  });

  it('should detect encoding with smaller sample size', function() {
    var options = { sampleSize: 32 };
    var encoding = chardet.detectFileSync(path, options);

    assert.equal(encoding, 'UTF-8');
  });
});
});

0 comments on commit 320e389

Please sign in to comment.