From ba88b89bdd2c281b5bd1e4ab6742bfd82941e322 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 27 Aug 2024 15:06:04 -0400 Subject: [PATCH] support the bestEffort flag --- README.md | 5 +++++ index.d.ts | 1 + index.js | 9 +++++++-- src/cld.cc | 8 +++++++- test/runner.js | 17 ++++++++++++++++- 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5d30760..14b34d8 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,11 @@ Pass top level domain as a hint Pass an HTTP "Content-Encoding" value as a hint +#### bestEffort + +Set to true to return a best-effort answer instead of UNKNOWN_LANGUAGE. May be useful for +short text if the caller prefers an approximate answer over none. + ## Warning Once the module has been installed, the underlying C sources will remain in the ```deps/cld``` folder and continue to occupy considerable space. This is because they will be required if you ever need to run `npm rebuild`. If you are under severe constraints you can delete this folder and reclam >100M diff --git a/index.d.ts b/index.d.ts index 1333b03..01dc991 100644 --- a/index.d.ts +++ b/index.d.ts @@ -16,6 +16,7 @@ interface Options { readonly encodingHint?: string; readonly tldHint?: string; readonly httpHint?: string; + readonly bestEffort?: boolean; } interface DetectLanguage { readonly reliable: boolean; diff --git a/index.js b/index.js index b59d40b..72109a8 100644 --- a/index.js +++ b/index.js @@ -27,13 +27,17 @@ module.exports = { languageHint : '', encodingHint : '', tldHint : '', - httpHint : '' + httpHint : '', + bestEffort : false }; options = _.defaults({}, options, defaults); if (!_.isBoolean(options.isHTML)) { throw new Error('Invalid isHTML value'); } + if (!_.isBoolean(options.bestEffort)) { + throw new Error('Invalid bestEffort value'); + } if (!_.isString(options.languageHint)) { throw new Error('Invalid languageHint'); } @@ -64,7 +68,8 @@ module.exports = { options.languageHint, options.encodingHint, options.tldHint, - 
options.httpHint + options.httpHint, + options.bestEffort ); if (result.languages.length < 1) { diff --git a/src/cld.cc b/src/cld.cc index 627e07e..fc6d343 100644 --- a/src/cld.cc +++ b/src/cld.cc @@ -19,6 +19,7 @@ namespace NodeCld { httpHint; int numBytes; bool isPlainText; + bool bestEffort; }; struct CLDOutput { @@ -52,6 +53,7 @@ namespace NodeCld { if (info[5].IsString()) { input->httpHint = info[5].ToString().Utf8Value(); } + input->bestEffort = info[6].ToBoolean(); return input; } @@ -79,13 +81,17 @@ namespace NodeCld { if (input->httpHint.length() > 0) { hints.content_language_hint = input->httpHint.c_str(); } + int flags = 0; + if (input->bestEffort) { + flags |= CLD2::kCLDFlagBestEffort; + } CLD2::ExtDetectLanguageSummary( input->bytes.c_str(), input->numBytes, input->isPlainText, &hints, - 0, + flags, output->language3, output->percent3, output->normalized_score3, diff --git a/test/runner.js b/test/runner.js index 1fab928..d5d3036 100755 --- a/test/runner.js +++ b/test/runner.js @@ -11,7 +11,7 @@ async function runCoreTests(detected) { return; } - const result = await cld.detct(val.sample); + const result = await cld.detect(val.sample); assert.equal(_.isArray(result.languages), true); assert.equal(result.languages.length > 0, true); assert.equal(val.name, result.languages[0].name); @@ -20,6 +20,20 @@ async function runCoreTests(detected) { } } +async function runBestEffortTests() { + for (const val of data.all) { + if (!val.testOnWindows) { + return; + } + + const result = await cld.detect(val.sample, {bestEffort: true}); + assert.equal(_.isArray(result.languages), true); + assert.equal(result.languages.length > 0, true); + assert.equal(val.name, result.languages[0].name); + } +} + + async function runChunkTests() { for (const val of data.all) { if (!val.testOnWindows) { @@ -165,6 +179,7 @@ function runCrossCheckTests(detected) { let detected = {}; await runCoreTests(detected); + await runBestEffortTests(); await runChunkTests(); await 
runEncodingHintTests(); await runLanguageHintTests();