Skip to content

Commit

Permalink
Fallback to Google when Fasttext is uncertain
Browse files Browse the repository at this point in the history
  • Loading branch information
computermacgyver committed Sep 25, 2024
1 parent 30bde9b commit b2ed0b7
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 6 deletions.
2 changes: 1 addition & 1 deletion app/main/lib/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
from app.main.lib.error_log import ErrorLog
from app.main.lib.langid import FastTextLangidProvider as LangidProvider
from app.main.lib.langid import HybridLangidProvider as LangidProvider

def get_all_documents_matching_context(context):
matches, clause_count = generate_matches(context)
Expand Down
17 changes: 16 additions & 1 deletion app/main/lib/langid.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import requests
import cld3
import fasttext
import json

from app.main.lib.google_client import get_credentialed_google_client

Expand Down Expand Up @@ -98,4 +99,18 @@ def langid(text):
# Smoke check: runs the class-level FastText model on a fixed sample string
# and returns True if that call completes without raising.
def test():
FastTextLangidProvider.fasttext_model.get_language("Some text to check")
return True


class HybridLangidProvider:
    """Language identification that tries FastText first, then Google.

    FastText is consulted for every request. When its reported confidence is
    below ``CONFIDENCE_THRESHOLD`` the text is re-submitted to Google and the
    Google result is returned instead.
    """

    # FastText results below this confidence trigger the Google fallback.
    CONFIDENCE_THRESHOLD = 0.8

    @staticmethod
    def langid(text):
        """Identify the language of ``text``.

        Args:
            text: The text to classify; passed unchanged to the providers.

        Returns:
            The FastText provider's result when its confidence is at least
            ``CONFIDENCE_THRESHOLD``; otherwise the Google provider's result.

        Raises:
            KeyError: If the FastText result has no ``result.confidence``
                entry.
        """
        fasttext_result = FastTextLangidProvider.langid(text)

        # The key was previously misspelled 'confience', which raised KeyError
        # on every call and made the fallback unreachable.
        # NOTE(review): assumes FastTextLangidProvider.langid returns
        # {'result': {'confidence': <float>, ...}, ...} -- confirm against it.
        confidence = fasttext_result['result']['confidence']
        if confidence < HybridLangidProvider.CONFIDENCE_THRESHOLD:
            google_result = GoogleLangidProvider.langid(text)
            # Record both answers so disagreements between providers can be
            # inspected later.
            app.logger.debug(json.dumps({
                'service': 'LangId',
                'message': 'Called Google after inconclusive FastText results',
                'parameters': {
                    'text': text,
                    'fasttext': fasttext_result,
                    'google': google_result,
                },
            }))
            return google_result

        return fasttext_result
2 changes: 1 addition & 1 deletion app/main/lib/text_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from app.main.lib import elastic_crud
from app.main.lib.shared_models.shared_model import SharedModel
from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
from app.main.lib.langid import FastTextLangidProvider as LangidProvider
from app.main.lib.langid import HybridLangidProvider as LangidProvider
from app.main.lib.openai import retrieve_openai_embeddings, PREFIX_OPENAI
ELASTICSEARCH_DEFAULT_LIMIT = 10000
def delete_text(doc_id, context, quiet):
Expand Down
15 changes: 12 additions & 3 deletions app/test/test_langid.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from app.main import db
from app.test.base import BaseTestCase
from app.main.lib.langid import GoogleLangidProvider, Cld3LangidProvider, FastTextLangidProvider
from app.main.lib.langid import GoogleLangidProvider, Cld3LangidProvider, FastTextLangidProvider, HybridLangidProvider
from app.main.controller.langid_controller import LangidResource
from app.main.lib import redis_client

Expand Down Expand Up @@ -51,10 +51,19 @@ def test_cleanup_result(self):
for test in RESULTS:
self.assertEqual(test['expected'], LangidResource.cleanup_result(test['test']))

# @unittest.skipIf(os.path.isfile('../../google_credentials.json'), "Google credentials file is missing")
# def test_langid_google(self):
# for test in TestLangidBlueprint.TESTS:
# result = GoogleLangidProvider.langid(test['text'])
# if type(test['google']) == str:
# self.assertEqual(test['google'], result['result']['language'], test['text'])
# else:
# self.assertTrue(result['result']['language'] in test['google'])

@unittest.skipIf(os.path.isfile('../../google_credentials.json'), "Google credentials file is missing")
def test_langid_google(self):
def test_langid_hybrid(self):
for test in TestLangidBlueprint.TESTS:
result = GoogleLangidProvider.langid(test['text'])
result = HybridLangidProvider.langid(test['text'])
if type(test['google']) == str:
self.assertEqual(test['google'], result['result']['language'], test['text'])
else:
Expand Down

0 comments on commit b2ed0b7

Please sign in to comment.