Skip to content

Commit

Permalink
Merge pull request #32 from meedan/cv2-3408-language-codes
Browse files Browse the repository at this point in the history
Cv2 3408 language code standardization
  • Loading branch information
DGaffney authored Sep 22, 2023
2 parents 72487b0 + 8bfbd54 commit a3c8071
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 9 deletions.
16 changes: 15 additions & 1 deletion lib/model/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import fasttext
from huggingface_hub import hf_hub_download

from langcodes import standardize_tag

from lib.model.model import Model
from lib import schemas


class FasttextModel(Model):
def __init__(self):
"""
Expand All @@ -25,7 +28,18 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s
detectable_texts = [e.body.text for e in docs]
detected_langs = []
for text in detectable_texts:
detected_langs.append(self.model.predict(text)[0][0])
model_output = self.model.predict(text) #format (('__label__zho_Hans',), array([0.81644011])), where zho is 3-letter ISO code, Hans is script tag, and 0.81 is certainty
model_certainty = model_output[1][0].round(4) #value representing model's certainty

#standardize_tag converts to 2-letter language codes where possible
#and drops the script subtag unless the language is commonly written in more than one script
#setting macro=True also folds individual ISO languages into their macrolanguage (e.g. swh -> swa -> sw)
model_language = standardize_tag(model_output[0][0][9:], macro = True)
model_script = None
if '-' in model_language:
[model_language, model_script] = model_language.split('-')

detected_langs.append({'language': model_language, 'script': model_script, 'score': model_certainty})

for doc, detected_lang in zip(docs, detected_langs):
doc.response = detected_lang
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ uvicorn[standard]==0.19.0
httpx==0.23.1
huggingface-hub==0.11.0
fasttext==0.9.2
langcodes==3.3.0
requests==2.31.0
pytest==7.4.0
sentry-sdk==1.30.0
13 changes: 5 additions & 8 deletions test/lib/model/test_fasttext.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import unittest
from unittest.mock import patch, MagicMock
import numpy as np
from lib.model.fasttext import FasttextModel
from lib import schemas

Expand All @@ -12,16 +13,12 @@ def setUp(self):
def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download):
mock_hf_hub_download.return_value = 'mocked_path'
mock_fasttext_load_model.return_value = self.mock_model
self.mock_model.predict.return_value = (['__label__eng_Latn'], [0.9])

self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9]))
model = FasttextModel() # Now it uses mocked functions
query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="今天是星期二"))]

query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))]
response = model.respond(query)

self.assertEqual(len(response), 2)
self.assertEqual(response[0].response, '__label__eng_Latn')
self.assertEqual(response[1].response, '__label__eng_Latn') # Mocked, so it will be the same
self.assertEqual(len(response), 1)
self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 0.9})

if __name__ == '__main__':
unittest.main()

0 comments on commit a3c8071

Please sign in to comment.