Skip to content

Commit

Permalink
Incorporate CLD3 too. Fix misc.
Browse files Browse the repository at this point in the history
  • Loading branch information
computermacgyver committed Sep 25, 2024
1 parent b2ed0b7 commit 2b6c38a
Showing 1 changed file with 36 additions and 11 deletions.
47 changes: 36 additions & 11 deletions app/main/lib/langid.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ def langid(text):
'language': response[0]['language'],
'confidence': response[0]['confidence']
},
'raw': response
'raw': response,
'model': 'Google',
}

@staticmethod
Expand Down Expand Up @@ -70,7 +71,8 @@ def langid(text):
'language': prediction.language,
'confidence': prediction.probability
},
'raw': prediction
'raw': prediction,
'model': 'CLD3',
}

@staticmethod
Expand All @@ -83,16 +85,23 @@ class FastTextLangidProvider:
fasttext_model = fasttext.load_model("extra/fasttext_language_id/lid.176.ftz")
@staticmethod
def langid(text):
    """Identify the language of ``text`` with the bundled FastText model.

    Args:
        text: The string to classify.

    Returns:
        A dict with three keys:
          * ``'result'``: ``{'language': <code>, 'confidence': <float>}`` for
            the top prediction.
          * ``'raw'``: the model output as a two-item list
            ``[labels, probabilities]``.
          * ``'model'``: the literal string ``'FastText'``.
    """
    # predict() returns a pair such as
    # (('__label__en',), array([0.22517213])); make it a list so the
    # probabilities entry can be replaced below.
    prediction = list(FastTextLangidProvider.fasttext_model.predict(text))

    # Convert the probability array to a plain Python list so the raw
    # prediction holds only built-in types (e.g. when a caller serializes it).
    prediction[1] = prediction[1].tolist()

    # Labels look like '__label__en'; keep only the trailing language code.
    language = prediction[0][0].split("__")[-1]

    # Use 'fil' for Filipino rather than 'tl' for Tagalog.
    if language == "tl":
        language = "fil"

    return {
        'result': {
            'language': language,
            'confidence': prediction[1][0],
        },
        'raw': prediction,
        'model': 'FastText',
    }

@staticmethod
Expand All @@ -104,13 +113,29 @@ class HybridLangidProvider:
@staticmethod
def langid(text):
    """Identify the language of ``text`` using FastText and CLD3, with Google as fallback.

    Both local models are run first. If they agree on the language, or if
    either of them reports a confidence of at least 0.8, the result with the
    higher confidence is returned. Otherwise the Google provider is called
    and its result is returned.

    Args:
        text: The string to classify.

    Returns:
        The result dict of whichever provider was chosen, unchanged.
    """
    # Named *_result to avoid shadowing the `fasttext` module inside this function.
    fasttext_result = FastTextLangidProvider.langid(text)
    cld_result = Cld3LangidProvider.langid(text)

    fasttext_confidence = fasttext_result['result']['confidence']
    cld_confidence = cld_result['result']['confidence']
    models_agree = (
        fasttext_result['result']['language'] == cld_result['result']['language']
    )

    if models_agree or max(fasttext_confidence, cld_confidence) >= 0.8:
        # FastText and CLD agree, or one of them is at least 80% confident.
        if not models_agree:
            # Log when there is disagreement.
            app.logger.debug(json.dumps({
                'service': 'LangId',
                'message': 'Disagreement between fasttext and cld. Returning higher confidence model',
                'parameters': {'text': text, 'fasttext': fasttext_result, 'cld': cld_result},
            }))
        # Return the higher-confidence result; ties go to CLD.
        if fasttext_confidence > cld_confidence:
            return fasttext_result
        return cld_result

    # Fall back to Google when the models disagree and neither has a
    # high-confidence result.
    google_result = GoogleLangidProvider.langid(text)
    app.logger.debug(json.dumps({
        'service': 'LangId',
        'message': 'Called Google after inconclusive local results',
        'parameters': {
            'text': text,
            'fasttext': fasttext_result,
            'cld': cld_result,
            'google': google_result,
        },
    }))
    return google_result

0 comments on commit 2b6c38a

Please sign in to comment.