Merge pull request #32 from meedan/cv2-3408-language-codes

Cv2 3408 language code standardization
meedan · Sep 22, 2023 · a3c8071 · a3c8071
2 parents 72487b0 + 8bfbd54
commit a3c8071
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 9 deletions.
diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py
@@ -3,9 +3,12 @@
 import fasttext
 from huggingface_hub import hf_hub_download
 
+from langcodes import standardize_tag
+
 from lib.model.model import Model
 from lib import schemas
 
+
 class FasttextModel(Model):
     def __init__(self):
         """
@@ -25,7 +28,18 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s
         detectable_texts = [e.body.text for e in docs]
         detected_langs = []
         for text in detectable_texts:
-            detected_langs.append(self.model.predict(text)[0][0])
+            model_output = self.model.predict(text)  #format (('__label__zho_Hans',), array([0.81644011])), where zho is 3-letter ISO code, Hans is script tag, and 0.81 is certainty
+            model_certainty = model_output[1][0].round(4)  #value representing model's certainty
+
+            # standardize_tag normalizes the tag to BCP 47 form, using 2-letter codes where one exists,
+            # and drops the script subtag unless the language is commonly written in more than one script.
+            # Setting macro=True also folds ISO 639 macrolanguage members into one code (e.g. swa and swh both -> sw).
+            model_language = standardize_tag(model_output[0][0][9:], macro = True)  
+            model_script = None
+            if '-' in model_language: 
+                [model_language, model_script] = model_language.split('-')
+
+            detected_langs.append({'language': model_language, 'script': model_script, 'score': model_certainty})  
 
         for doc, detected_lang in zip(docs, detected_langs):
             doc.response = detected_lang

diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,7 @@ uvicorn[standard]==0.19.0
 httpx==0.23.1
 huggingface-hub==0.11.0
 fasttext==0.9.2
+langcodes==3.3.0 
 requests==2.31.0
 pytest==7.4.0
 sentry-sdk==1.30.0
diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py
@@ -1,5 +1,6 @@
 import unittest
 from unittest.mock import patch, MagicMock
+import numpy as np
 from lib.model.fasttext import FasttextModel
 from lib import schemas
 
@@ -12,16 +13,12 @@ def setUp(self):
     def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download):
         mock_hf_hub_download.return_value = 'mocked_path'
         mock_fasttext_load_model.return_value = self.mock_model
-        self.mock_model.predict.return_value = (['__label__eng_Latn'], [0.9])
-
+        self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9]))
         model = FasttextModel()  # Now it uses mocked functions
-        query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="今天是星期二"))]
-
+        query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))]
         response = model.respond(query)
-
-        self.assertEqual(len(response), 2)
-        self.assertEqual(response[0].response, '__label__eng_Latn')
-        self.assertEqual(response[1].response, '__label__eng_Latn')  # Mocked, so it will be the same
+        self.assertEqual(len(response), 1)
+        self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 0.9})
 
 if __name__ == '__main__':
     unittest.main()