From 7b50c8960cd97d10c039b86a16d4c2ada67dba70 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Thu, 24 Aug 2023 13:41:31 -0700 Subject: [PATCH 01/13] add langcodes package --- lib/model/fasttext.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index 86a5e97..f2a50e2 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -3,6 +3,8 @@ import fasttext from huggingface_hub import hf_hub_download +from langcodes import * + from lib.model.model import Model from lib import schemas @@ -25,7 +27,7 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s detectable_texts = [e.body.text for e in docs] detected_langs = [] for text in detectable_texts: - detected_langs.append(self.model.predict(text)[0][0]) + detected_langs.append(standardize_tag(self.model.predict(text)[0][0][9:], macro = True)) for doc, detected_lang in zip(docs, detected_langs): doc.response = detected_lang From 4332fed33da319af35ebf48e98691b153b18658d Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Thu, 24 Aug 2023 13:43:21 -0700 Subject: [PATCH 02/13] update unittest --- test/lib/model/test_fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index a15946c..0332d29 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -18,8 +18,8 @@ def test_respond(self): response = self.model.respond(query) self.assertEqual(len(response), 2) - self.assertEqual(response[0].response, "__label__eng_Latn") - self.assertEqual(response[1].response, "__label__zho_Hans") + self.assertEqual(response[0].response, "en") + self.assertEqual(response[1].response, "zh-Hans") if __name__ == '__main__': unittest.main() From 1db38f3707a8a77acc3eb19d12300681b2eee98f Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Thu, 24 Aug 2023 13:46:28 -0700 Subject: [PATCH 03/13] update requirements to include langcodes --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index d18ca8c..1594b52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ uvicorn[standard]==0.19.0 httpx==0.23.1 huggingface-hub==0.11.0 fasttext==0.9.2 +langcodes==3.3.0 From 07a82aab8168ddb8bf7f146babe6ce99ade4c794 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Thu, 24 Aug 2023 15:33:30 -0700 Subject: [PATCH 04/13] documentation + restructuring for clarity --- lib/model/fasttext.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index f2a50e2..bce9c22 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -3,11 +3,12 @@ import fasttext from huggingface_hub import hf_hub_download -from langcodes import * +from langcodes import standardize_tag from lib.model.model import Model from lib import schemas + class FasttextModel(Model): def __init__(self): """ @@ -27,7 +28,15 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s detectable_texts = [e.body.text for e in docs] detected_langs = [] for text in detectable_texts: - detected_langs.append(standardize_tag(self.model.predict(text)[0][0][9:], macro = True)) + model_output = self.model.predict(text) #format (('__label__zho_Hans',), array([0.81644011])), where zho is 3-letter ISO code, Hans is script tag, and 0.81 is certainty + model_certainty = model_output[1][0] #float [0, 1] value representing model's certainty + + #standardize_tag will standardize to 2-letter codes where possible + #and will remove the script tag unless the language is often written in different scripts + #setting macro=True allows us to standardize ISO macro languages (eg. swa == swh -> sw) + model_language = standardize_tag(model_output[0][0][9:], macro = True) + + detected_langs.append(model_language) for doc, detected_lang in zip(docs, detected_langs): doc.response = detected_lang From 775e7fc5d36eb8e20cfd9b94f2f36edbe1113fef Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:00:11 -0700 Subject: [PATCH 05/13] include score in output --- lib/model/fasttext.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index bce9c22..9029d71 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -35,8 +35,11 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s #and will remove the script tag unless the language is often written in different scripts #setting macro=True allows us to standardize ISO macro languages (eg. swa == swh -> sw) model_language = standardize_tag(model_output[0][0][9:], macro = True) + model_script = None + if '-' in model_language: + [model_language, model_script] = model_language.split('-') - detected_langs.append(model_language) + detected_langs.append({'language': model_language, 'script': model_script, 'score': model_certainty}) for doc, detected_lang in zip(docs, detected_langs): doc.response = detected_lang From 2ea48eefa4a84c5acd32ac6ded93cdfde9bc97c7 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:02:28 -0700 Subject: [PATCH 06/13] update tests --- test/lib/model/test_fasttext.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 0332d29..bcdb119 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -18,8 +18,8 @@ def test_respond(self): response = self.model.respond(query) self.assertEqual(len(response), 2) - self.assertEqual(response[0].response, "en") - self.assertEqual(response[1].response, "zh-Hans") + self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 1.0000062}) + self.assertEqual(response[1].response, {'language': 'zh', 'script': 'Hans', 'score': 0.83046132}) if __name__ == '__main__': unittest.main() From 5989092f11afa41eab662b08076e1868d9a8d597 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:16:07 -0700 Subject: [PATCH 07/13] round certainty value --- lib/model/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index 9029d71..52e1e3c 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -29,7 +29,7 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s detected_langs = [] for text in detectable_texts: model_output = self.model.predict(text) #format (('__label__zho_Hans',), array([0.81644011])), where zho is 3-letter ISO code, Hans is script tag, and 0.81 is certainty - model_certainty = model_output[1][0] #float [0, 1] value representing model's certainty + model_certainty = model_output[1][0].round(4) #value representing model's certainty #standardize_tag will standardize to 2-letter codes where possible #and will remove the script tag unless the language is often written in different scripts From 405ebc3b385f83f04a462735780ae8c48e2ece28 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:17:49 -0700 Subject: [PATCH 08/13] more tests --- test/lib/model/test_fasttext.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index bcdb119..593e3f9 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -13,13 +13,21 @@ def setUp(self): self.mock_model = MagicMock() def test_respond(self): - query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="今天是星期二"))] + query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), + schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="今天是星期二")), + schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="چھِ کٲشرۍ نٹن گۅرزٕ خنجر وُچھِتھ اَژان لرزٕ چھُکھ کانہہ دِلاور وُچھِتھ")), + schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="🐐🐐🐐🐐123")), + schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text=""))] + response = self.model.respond(query) self.assertEqual(len(response), 2) - self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 1.0000062}) - self.assertEqual(response[1].response, {'language': 'zh', 'script': 'Hans', 'score': 0.83046132}) + self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 1.0}) + self.assertEqual(response[1].response, {'language': 'zh', 'script': 'Hans', 'score': 0.8305}) + self.assertEqual(response[2].response, {'language': 'ks', 'script': 'Arab', 'score': 0.9999}) + self.assertEqual(response[3].response, {'language': 'bo', 'script': 'Tibt', 'score': 0.2168}) #non-text content returns random language with low certainty + self.assertEqual(response[3].response, {'language': 'en', 'script': None, 'score': 0.8267}) #empty string returns english with high-ish confidence if __name__ == '__main__': unittest.main() From a31d44381fa5099a9029d195efb5076e78bc8c1b Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:32:13 -0700 Subject: [PATCH 09/13] typo --- test/lib/model/test_fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 593e3f9..35b9675 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -22,7 +22,7 @@ def test_respond(self): response = self.model.respond(query) - self.assertEqual(len(response), 2) + self.assertEqual(len(response), 5) self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 1.0}) self.assertEqual(response[1].response, {'language': 'zh', 'script': 'Hans', 'score': 0.8305}) self.assertEqual(response[2].response, {'language': 'ks', 'script': 'Arab', 'score': 0.9999}) From ea519b3bbd108334666ab50dfb2d608463ed4e19 Mon Sep 17 00:00:00 2001 From: Amy Dunphy <43973948+amydunphy@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:44:19 -0700 Subject: [PATCH 10/13] another typo :( --- test/lib/model/test_fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 35b9675..29decb7 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -27,7 +27,7 @@ def test_respond(self): self.assertEqual(response[1].response, {'language': 'zh', 'script': 'Hans', 'score': 0.8305}) self.assertEqual(response[2].response, {'language': 'ks', 'script': 'Arab', 'score': 0.9999}) self.assertEqual(response[3].response, {'language': 'bo', 'script': 'Tibt', 'score': 0.2168}) #non-text content returns random language with low certainty - self.assertEqual(response[3].response, {'language': 'en', 'script': None, 'score': 0.8267}) #empty string returns english with high-ish confidence + self.assertEqual(response[4].response, {'language': 'en', 'script': None, 'score': 0.8267}) #empty string returns english with high-ish confidence if __name__ == '__main__': unittest.main() From 290a7403771744b1771dbfc7d83795db1d429ed9 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 22 Sep 2023 11:47:30 -0700 Subject: [PATCH 11/13] Update test_fasttext.py --- test/lib/model/test_fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index ab2129e..0a0315c 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -12,7 +12,7 @@ def setUp(self): def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download): mock_hf_hub_download.return_value = 'mocked_path' mock_fasttext_load_model.return_value = self.mock_model - self.mock_model.predict.return_value = (['__label__eng_Latn'], [0.9]) + self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.902323124])) model = FasttextModel() # Now it uses mocked functions query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] response = model.respond(query) From 44a523f4af9ff84aeaac2a30080d985dd3d8372b Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 22 Sep 2023 11:58:57 -0700 Subject: [PATCH 12/13] Update test_fasttext.py --- test/lib/model/test_fasttext.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 0a0315c..f2a9073 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -1,5 +1,6 @@ import unittest from unittest.mock import patch, MagicMock +import numpy as np from lib.model.fasttext import FasttextModel from lib import schemas From 8bfbd549c80dc14af7c0d7d2152e115c2773a1b8 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 22 Sep 2023 12:09:12 -0700 Subject: [PATCH 13/13] Update test_fasttext.py --- test/lib/model/test_fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index f2a9073..be160a2 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -13,7 +13,7 @@ def setUp(self): def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download): mock_hf_hub_download.return_value = 'mocked_path' mock_fasttext_load_model.return_value = self.mock_model - self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.902323124])) + self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9])) model = FasttextModel() # Now it uses mocked functions query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] response = model.respond(query)