diff --git a/README.md b/README.md index c7a5b37..d95fec4 100644 --- a/README.md +++ b/README.md @@ -19,15 +19,25 @@ Linux users, make sure you have g++ >= 4.8. If this is not an option, you should ## Examples ### Simple ```js -require('cld').detect('This is a language recognition example', function(err, result) { +const cld = require('cld'); + +// As a promise +cld.detect('This is a language recognition example').then((result) => { console.log(result); }); + +// In an async function +async function testCld() { + const result = await cld.detect('This is a language recognition example'); + console.log(result); +} ``` ### Advanced ```js -var text = 'Това е пример за разпознаване на Български език'; -var options = { +const cld = require('cld'); +const text = 'Това е пример за разпознаване на Български език'; +const options = { isHTML : false, languageHint : 'BULGARIAN', encodingHint : 'ISO_8859_5', @@ -35,11 +45,27 @@ var options = { httpHint : 'bg' }; -require('cld').detect(text, options, function(err, result) { +// As a promise +cld.detect(text, options).then((result) => { console.log(result); }); + +// In an async function +async function testCld() { + const result = await cld.detect(text, options); + console.log(result); +} ``` +### Legacy +Detect can be called leveraging the node callback pattern. If options are provided, the third parameter should be the callback. 
+```javascript +const cld = require('cld'); + +cld.detect('This is a language recognition example', (err, result) => { + console.log(result); +}); +``` ## Options diff --git a/index.js b/index.js index 65a8272..0017aa3 100644 --- a/index.js +++ b/index.js @@ -1,76 +1,87 @@ -var _ = require('underscore'); -var cld2 = require('./build/Release/cld'); +const _ = require('underscore'); +const cld2 = require('./build/Release/cld'); module.exports = { LANGUAGES : cld2.LANGUAGES, DETECTED_LANGUAGES : cld2.DETECTED_LANGUAGES, ENCODINGS : cld2.ENCODINGS, - detect : function (text, options, cb) { - if (arguments.length < 2) { - return; - } - if (arguments.length < 3) { + async detect(text, options) { + let cb = arguments[2]; + if (typeof cb !== 'function' && typeof options === 'function') { cb = options; options = {}; } - if (!_.isFunction(cb)) { - return; - } - if (!_.isString(text) || text.length < 1) { - return cb({message:'Empty or invalid text'}); - } + try { + if (arguments.length < 1) { + throw new Error('Not enough arguments provided'); + } - var defaults = { - isHTML : false, - languageHint : '', - encodingHint : '', - tldHint : '', - httpHint : '' - }; - options = _.defaults(options, defaults); + if (!_.isString(text) || text.length < 1) { + throw new Error('Empty or invalid text'); + } - if (!_.isBoolean(options.isHTML)) { - return cb({message:'Invalid isHTML value'}); - } - if (!_.isString(options.languageHint)) { - return cb({message:'Invalid languageHint'}); - } - if (!_.isString(options.encodingHint)) { - return cb({message:'Invalid encodingHint'}); - } - if (!_.isString(options.tldHint)) { - return cb({message:'Invalid tldHint'}); - } - if (!_.isString(options.httpHint)) { - return cb({message:'Invalid httpHint'}); - } - if (options.encodingHint.length > 0 && - !~cld2.ENCODINGS.indexOf(options.encodingHint)) { + const defaults = { + isHTML : false, + languageHint : '', + encodingHint : '', + tldHint : '', + httpHint : '' + }; + options = _.defaults(options, 
defaults); - return cb({message:'Invalid encodingHint, see ENCODINGS'}); - } - if (options.languageHint.length > 0 && - !~_.keys(cld2.LANGUAGES).indexOf(options.languageHint) && - !~_.values(cld2.LANGUAGES).indexOf(options.languageHint)) { + if (!_.isBoolean(options.isHTML)) { + throw new Error('Invalid isHTML value'); + } + if (!_.isString(options.languageHint)) { + throw new Error('Invalid languageHint'); + } + if (!_.isString(options.encodingHint)) { + throw new Error('Invalid encodingHint'); + } + if (!_.isString(options.tldHint)) { + throw new Error('Invalid tldHint'); + } + if (!_.isString(options.httpHint)) { + throw new Error('Invalid httpHint'); + } + if (options.encodingHint.length > 0 && + !~cld2.ENCODINGS.indexOf(options.encodingHint)) { - return cb({message:'Invalid languageHint, see LANGUAGES'}); - } + throw new Error('Invalid encodingHint, see ENCODINGS'); + } + if (options.languageHint.length > 0 && + !~_.keys(cld2.LANGUAGES).indexOf(options.languageHint) && + !~_.values(cld2.LANGUAGES).indexOf(options.languageHint)) { - var result = cld2.detect( - text, - !options.isHTML, - options.languageHint, - options.encodingHint, - options.tldHint, - options.httpHint - ); + throw new Error('Invalid languageHint, see LANGUAGES'); + } - if (result.languages.length < 1) { - return cb({message:'Failed to identify language'}); - } + const result = await cld2.detectAsync( + text, + !options.isHTML, + options.languageHint, + options.encodingHint, + options.tldHint, + options.httpHint + ); + + if (result.languages.length < 1) { + throw new Error('Failed to identify language'); + } - return cb(null, result); + if (cb) { + return cb(null, result); + } else { + return result; + } + } catch (error) { + if (cb) { + cb(error); + } else { + throw error; + } + } } }; diff --git a/src/cld.cc b/src/cld.cc index 725a165..8c95696 100644 --- a/src/cld.cc +++ b/src/cld.cc @@ -1,3 +1,6 @@ +#include +#include + #include "compact_lang_det.h" #include "encodings.h" #include 
"constants.h" @@ -9,73 +12,97 @@ using std::unexpected_handler; #include namespace NodeCld { - Napi::Object Detect(const Napi::CallbackInfo& info) { - auto env = info.Env(); - - std::string text = info[0].ToString().Utf8Value(); - const char *bytes = text.c_str(); - int numBytes = text.length(); - bool isPlainText = info[1].ToBoolean(); + struct CLDInput { + std::string bytes, + languageHint, + encodingHint, + tldHint, + httpHint; + int numBytes; + bool isPlainText; + }; + + struct CLDOutput { + CLD2::Language language3[3]; + int percent3[3]; + double normalized_score3[3]; + CLD2::ResultChunkVector resultChunkVector; + int textBytesFound; + bool isReliable; + }; - CLD2::CLDHints hints; - hints.tld_hint = 0; - hints.content_language_hint = 0; - hints.language_hint = CLD2::UNKNOWN_LANGUAGE; - hints.encoding_hint = CLD2::UNKNOWN_ENCODING; + std::unique_ptr<CLDInput> UnpackInputFromJSArgs(const Napi::CallbackInfo &info) { + std::unique_ptr<CLDInput> input(new CLDInput); + input->bytes = info[0].ToString().Utf8Value(); + input->numBytes = input->bytes.length(); + input->isPlainText = info[1].ToBoolean(); if (info[2].IsString()) { - std::string languageHint = info[2].ToString().Utf8Value(); - if (languageHint.length() > 0) { - hints.language_hint = Constants::getInstance().getLanguageFromName(languageHint.c_str()); - } + input->languageHint = info[2].ToString().Utf8Value(); } if (info[3].IsString()) { - std::string encodingHint = info[3].ToString().Utf8Value(); - if (encodingHint.length() > 0) { - hints.encoding_hint = Constants::getInstance().getEncodingFromName(encodingHint.c_str()); - } + input->encodingHint = info[3].ToString().Utf8Value(); } if (info[4].IsString()) { - std::string tldHint = info[4].ToString().Utf8Value(); - if (tldHint.length() > 0) { - hints.tld_hint = tldHint.c_str(); - } + input->tldHint = info[4].ToString().Utf8Value(); } if (info[5].IsString()) { - std::string httpHint = info[5].ToString().Utf8Value(); - if (httpHint.length() > 0) { - 
hints.content_language_hint = 
httpHint.c_str(); - } + input->httpHint = info[5].ToString().Utf8Value(); } - CLD2::Language language3[3]; - int percent3[3]; - double normalized_score3[3]; - CLD2::ResultChunkVector resultChunkVector; - int textBytesFound; - bool isReliable; + return input; + } + + std::unique_ptr<CLDOutput> DetectLanguage(std::unique_ptr<CLDInput> input) { + std::unique_ptr<CLDOutput> output(new CLDOutput); + CLD2::CLDHints hints; + hints.tld_hint = 0; + hints.content_language_hint = 0; + hints.language_hint = CLD2::UNKNOWN_LANGUAGE; + hints.encoding_hint = CLD2::UNKNOWN_ENCODING; + + if (input->languageHint.length() > 0) { + hints.language_hint = Constants::getInstance().getLanguageFromName(input->languageHint.c_str()); + } + + if (input->encodingHint.length() > 0) { + hints.encoding_hint = Constants::getInstance().getEncodingFromName(input->encodingHint.c_str()); + } + + if (input->tldHint.length() > 0) { + hints.tld_hint = input->tldHint.c_str(); + } + + if (input->httpHint.length() > 0) { + hints.content_language_hint = input->httpHint.c_str(); + } CLD2::ExtDetectLanguageSummary( - bytes, numBytes, - isPlainText, + input->bytes.c_str(), + input->numBytes, + input->isPlainText, &hints, 0, - language3, - percent3, - normalized_score3, - &resultChunkVector, - &textBytesFound, - &isReliable + output->language3, + output->percent3, + output->normalized_score3, + &output->resultChunkVector, + &output->textBytesFound, + &output->isReliable ); + return output; + } + + Napi::Object UnpackOutputToJS(const Napi::Env env, std::unique_ptr<CLDOutput> output) { size_t languageIdx = 0; auto languages = Napi::Array::New(env); for (size_t resultIdx = 0; resultIdx < 3; resultIdx++) { - CLD2::Language lang = language3[resultIdx]; + CLD2::Language lang = output->language3[resultIdx]; if (lang == CLD2::UNKNOWN_LANGUAGE) { continue; @@ -84,16 +111,16 @@ namespace NodeCld { auto item = Napi::Object::New(env); item["name"] = Napi::String::New(env, Constants::getInstance().getLanguageName(lang)); 
item["code"] = Napi::String::New(env, 
Constants::getInstance().getLanguageCode(lang)); - item["percent"] = Napi::Number::New(env, percent3[resultIdx]); - item["score"] = Napi::Number::New(env, normalized_score3[resultIdx]); + item["percent"] = Napi::Number::New(env, output->percent3[resultIdx]); + item["score"] = Napi::Number::New(env, output->normalized_score3[resultIdx]); languages[languageIdx++] = item; } size_t chunkIdx = 0; auto chunks = Napi::Array::New(env); - for (size_t resultIdx = 0; resultIdx < resultChunkVector.size(); resultIdx++) { - CLD2::ResultChunk chunk = resultChunkVector.at(resultIdx); + for (size_t resultIdx = 0; resultIdx < output->resultChunkVector.size(); resultIdx++) { + CLD2::ResultChunk chunk = output->resultChunkVector.at(resultIdx); CLD2::Language lang = static_cast<CLD2::Language>(chunk.lang1); if (lang == CLD2::UNKNOWN_LANGUAGE) { @@ -110,14 +137,51 @@ namespace NodeCld { } auto results = Napi::Object::New(env); - results["reliable"] = Napi::Boolean::New(env, isReliable); - results["textBytes"] = Napi::Number::New(env, textBytesFound); + results["reliable"] = Napi::Boolean::New(env, output->isReliable); + results["textBytes"] = Napi::Number::New(env, output->textBytesFound); results["languages"] = languages; results["chunks"] = chunks; return results; } + class DetectAsyncWorker : public Napi::AsyncWorker { + public: + DetectAsyncWorker(const Napi::CallbackInfo &info): + Napi::AsyncWorker(info.Env()), + deferred(Napi::Promise::Deferred::New(info.Env())), + mInput(UnpackInputFromJSArgs(info)) + {} + + void Execute() { + mOutput = DetectLanguage(std::move(mInput)); + } + + void OnOK() { + deferred.Resolve(UnpackOutputToJS(Env(), std::move(mOutput))); + } + + Napi::Promise Promise() { + this->Queue(); + return deferred.Promise(); + } + + private: + Napi::Promise::Deferred deferred; + std::unique_ptr<CLDInput> mInput; + std::unique_ptr<CLDOutput> mOutput; + }; + + Napi::Object Detect(const Napi::CallbackInfo &info) { + auto input = UnpackInputFromJSArgs(info); + auto output = 
DetectLanguage(std::move(input)); + 
return UnpackOutputToJS(info.Env(), std::move(output)); + } + + Napi::Promise DetectAsync(const Napi::CallbackInfo &info) { + return (new DetectAsyncWorker(info))->Promise(); + } + Napi::Object Init(Napi::Env env, Napi::Object exports) { auto rawDetected = Constants::getInstance().getDetected(); auto numDetected = rawDetected->size(); @@ -146,7 +210,7 @@ namespace NodeCld { exports["ENCODINGS"] = encodings; exports["detect"] = Napi::Function::New(env, Detect); - + exports["detectAsync"] = Napi::Function::New(env, DetectAsync); return exports; } diff --git a/test/runner.js b/test/runner.js index 07e25aa..13cab79 100755 --- a/test/runner.js +++ b/test/runner.js @@ -5,63 +5,61 @@ var data = require('./data'); var assert = require('assert'); var _ = require('underscore'); -function runCoreTests(detected) { - _.each(data.all, function(val, key) { +async function runCoreTests(detected) { + for (const val of data.all) { if (!val.testOnWindows) { return; } - cld.detect(val.sample, function(err, result) { - assert.equal(err, null); - assert.equal(_.isArray(result.languages), true); - assert.equal(result.languages.length > 0, true); - assert.equal(val.name, result.languages[0].name); + const result = await cld.detect(val.sample); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + assert.equal(val.name, result.languages[0].name); - detected[val.name] = true; - }); - }); + detected[val.name] = true; + } } -function runChunkTests() { - _.each(data.all, function(val, key) { +async function runChunkTests() { + for (const val of data.all) { if (!val.testOnWindows) { return; } - cld.detect(val.sample, function(err, result) { - assert.equal(result.textBytes > 0, true); - if (val.sample == data.frEnLatn) { - assert.equal(_.isArray(result.chunks), true); - assert.equal(result.chunks.length, 3); + const result = await cld.detect(val.sample); + assert.equal(result.textBytes > 0, true); + if (val.sample == data.frEnLatn) { + 
assert.equal(_.isArray(result.chunks), true); + assert.equal(result.chunks.length, 3); - var chunkCodes = _.pluck(result.chunks, 'code'); - assert.deepEqual(chunkCodes, ['en', 'fr', 'en']) - } - }); - }); + var chunkCodes = _.pluck(result.chunks, 'code'); + assert.deepEqual(chunkCodes, ['en', 'fr', 'en']) + } + } } -function runEncodingHintTests() { - _.each(data.all, function(item, idx) { +async function runEncodingHintTests() { + for (const item of data.all) { if (!item.testOnWindows) { return; } - _.each(cld.ENCODINGS, function(encoding, idx) { - cld.detect(item.sample, {encodingHint:encoding}, function(err, result) { - assert.equal(err, null); - assert.equal(_.isArray(result.languages), true); - assert.equal(result.languages.length > 0, true); - }); - }); - }); + for (const encodingHint of cld.ENCODINGS) { + const result = await cld.detect(item.sample, { encodingHint }); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + } + } - cld.detect(data.all[0].sample, {encodingHint:'p'}, function(err, result) { + try { + await cld.detect(data.all[0].sample, { encodingHint: 'p' }); + assert.ok(false, 'Should not have detected'); + } catch (err) { assert.equal(err.message, 'Invalid encodingHint, see ENCODINGS'); - }); + } } -function runLanguageHintTests() { +async function runLanguageHintTests() { _.each(data.all, function(item, idx) { if (!item.testOnWindows) { return; @@ -96,52 +94,50 @@ function runLanguageHintTests() { }); } -function runTldHintTests() { - _.each(data.all, function(item, idx) { +async function runTldHintTests() { + for (const item of data.all) { if (!item.testOnWindows) { return; } - cld.detect(item.sample, {tldHint:'edu'}, function(err, result) { - assert.equal(err, null); - assert.equal(_.isArray(result.languages), true); - assert.equal(result.languages.length > 0, true); - }); - cld.detect(item.sample, {tldHint:'com'}, function(err, result) { - assert.equal(err, null); - 
assert.equal(_.isArray(result.languages), true); - assert.equal(result.languages.length > 0, true); - }); - cld.detect(item.sample, {tldHint:'id'}, function(err, result) { - assert.equal(err, null); - assert.equal(_.isArray(result.languages), true); - assert.equal(result.languages.length > 0, true); - }); - }); + let result = await cld.detect(item.sample, { tldHint: 'edu' }); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + + result = await cld.detect(item.sample, { tldHint: 'com' }); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + + result = await cld.detect(item.sample, { tldHint: 'id' }); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + } } -function runHttpHintTests() { - _.each(data.all, function(item, idx) { +async function runHttpHintTests() { + for (const item of data.all) { if (!item.testOnWindows) { return; } - cld.detect(item.sample, {httpHint:'mi,en'}, function(err, result) { - if (err) { - assert.equal(err.message, 'Failed to identify language'); - } - else { - assert.equal(err, null); - assert.equal(_.isArray(result.languages), true); - } - }); - }); + let result; + try { + result = await cld.detect(item.sample, { httpHint: 'mi,en' }); + } catch (err) { + assert.equal(err.message, 'Failed to identify language'); + return; + } + + assert.equal(_.isArray(result.languages), true); + } } -function runUnreliableTests() { - cld.detect('interaktive infografik \xc3\xbcber videospielkonsolen', function(err, result) { +async function runUnreliableTests() { + try { + await cld.detect('interaktive infografik \xc3\xbcber videospielkonsolen'); + } catch (err) { assert.equal(err.message, 'Failed to identify language'); - }); + } } function runCrossCheckTests(detected) { @@ -153,14 +149,15 @@ function runCrossCheckTests(detected) { assert.equal(_.keys(detected), 0); } - -var detected = {}; - 
-runCoreTests(detected); -runChunkTests(); -runEncodingHintTests(); -runLanguageHintTests(); -runTldHintTests(); -runHttpHintTests(); -runUnreliableTests(); -runCrossCheckTests(detected); +(async () => { + let detected = {}; + + await runCoreTests(detected); + await runChunkTests(); + await runEncodingHintTests(); + await runLanguageHintTests(); + await runTldHintTests(); + await runHttpHintTests(); + await runUnreliableTests(); + runCrossCheckTests(detected); +})();