From 7a1f37e0a600ea20a652565863fd454040f6c88e Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Thu, 22 Aug 2024 14:07:03 -0700 Subject: [PATCH 1/2] CV2-5122 add paraphrase multilingual --- lib/model/__pycache__/fptg.cpython-39.pyc | Bin 688 -> 671 bytes .../__pycache__/indian_sbert.cpython-39.pyc | Bin 665 -> 665 bytes lib/model/paraphrase_multilingual.py | 9 +++++ test/lib/model/test_indian_sbert.py | 6 ++-- .../lib/model/test_paraphrase_multilingual.py | 32 ++++++++++++++++++ 5 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 lib/model/paraphrase_multilingual.py create mode 100644 test/lib/model/test_paraphrase_multilingual.py diff --git a/lib/model/__pycache__/fptg.cpython-39.pyc b/lib/model/__pycache__/fptg.cpython-39.pyc index dc5a4af8f0b2218f46d9d7bb37448a62ee17f9a3..01d9571c4d5cb82a1531f731ac4772b27db633cc 100644 GIT binary patch delta 344 zcmX|*F-yZx5XbM{d#R;1x(N=#8;THw1e|md6`OXj2(8OoLiADzq)m7ULW_f7p>xSB zeg(gbOTR%UXE)y~;=%ooJAVJW`*c1WH+J0?n5f>rlitcb3Oc4CAk1Kb3K57Y76Ne+ z3m}o;8&{m1A$X0A^{rl_Q)y}f3JDY=RD20zBu0GERALDNl*Pr`xy>obHe~2}A5R8o zrfeRzTD*B{=!as-MYFQUH774U)|I{ k=pAwc+bo5VCJNNH8+f9=XL8r>(mPgo{mPD;7HS*l7txSSu>b%7 delta 341 zcmZ8c%Sr=55S*Sz5@Hqv5fPMl5OERQfC}D)c+!K>gL80Lw`b$xKH|)tyc)=P*$>c9 z@MApX5A-XX0Y$J6O%>H$-SsNoMGyx;pSYUdo#3zuwy#ExAVJ7U0TRGtn$vKOb-d+dO(Y1^#b+1Hg!DC@@$rsRr7Tqh?AgA;B+{OZmc#o!fBkQ(OHE$N*;9b7|o2f zb=AJX#(u|LvDCi;&zt}44wcHvtX3)%?Fa0(oQ6l-jf&r;9vIIr!;W0HKQUDOm%|!w Y-Hz$b3Uk18edFyD-Rt|q13CZ>-w+p43jhEB diff --git a/lib/model/__pycache__/indian_sbert.cpython-39.pyc b/lib/model/__pycache__/indian_sbert.cpython-39.pyc index 7eaafb95b348d596565a1789e4a1eed991d1aa7a..2e88dcd45f51a207f91a7d5ba89e2a5685e154f1 100644 GIT binary patch delta 333 zcmX|)ze)o^5XNVA_u>(AK7bIjMHCJp0V}Z)BmRkCnrO4lu{s+#xLm^CAsEsbNFA<9 z;j8!{DeY2O*}E&^zfLVC5(_5@kOY_5(FrZi%oDpA(L&%yS|Sf(H+@r zP?vwz&(l)&b4qnSd@{C_V~=;_wN7pJDvMI*q%FzO#7doI^2I!t6PwPAEjs^CszW>t z4!U|et>)Nj1}>{$PC(4|e&ahylDeQIiEGsdwi}yz@8aR|HW~KSRrQJSK6kM1LO2Og bp!M4z7OFaDtzG}j?s(x<%8o)$H5~N=`)Nyb delta 358 zcmYk1y-EW?5XWcsBMHe}rFM#q2#1ih5J@W*K?@6G3b#2fw|jcz?h%>?>LJN8ak)@hq>nnCi>1-FUWg)} ziPB9Ak$)p%>lwr4lj%5HpO&}+_`LhiV;IMImDh0`@#YPWnoo?*n7iV?_dPVezll2S ip8a_%uK)hB##7tof*cO0Q>N<+Yv$y*?@a>|Aj2=Ou2|*( diff --git a/lib/model/paraphrase_multilingual.py b/lib/model/paraphrase_multilingual.py new file mode 100644 index 00000000..05dcc4e6 --- /dev/null +++ b/lib/model/paraphrase_multilingual.py @@ -0,0 +1,9 @@ +from lib.model.generic_transformer import GenericTransformerModel +MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' +class Model(GenericTransformerModel): + BATCH_SIZE = 100 + def __init__(self): + """ + Init ParaphraseMultilingual model. Fairly standard for all vectorizers. + """ + super().__init__(MODEL_NAME) diff --git a/test/lib/model/test_indian_sbert.py b/test/lib/model/test_indian_sbert.py index 0115dfe6..2a387097 100644 --- a/test/lib/model/test_indian_sbert.py +++ b/test/lib/model/test_indian_sbert.py @@ -7,13 +7,13 @@ from lib.model.generic_transformer import GenericTransformerModel from lib import schemas -class TestIndianSbert(unittest.TestCase): +class TestParaphraseMultilingual(unittest.TestCase): def setUp(self): self.model = GenericTransformerModel(None) self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "indian_sbert__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "indian_sbert__Model"})] + texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "paraphrase_multilingual__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "paraphrase_multilingual__Model"})] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "indian_sbert__Model"}) + query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "paraphrase_multilingual__Model"}) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_paraphrase_multilingual.py b/test/lib/model/test_paraphrase_multilingual.py new file mode 100644 index 00000000..0115dfe6 --- /dev/null +++ b/test/lib/model/test_paraphrase_multilingual.py @@ -0,0 +1,32 @@ +import os +import unittest +from unittest.mock import MagicMock + +import numpy as np + +from lib.model.generic_transformer import GenericTransformerModel +from lib import schemas + +class TestIndianSbert(unittest.TestCase): + def setUp(self): + self.model = GenericTransformerModel(None) + self.mock_model = MagicMock() + + def test_vectorize(self): + texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "indian_sbert__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "indian_sbert__Model"})] + self.model.model = self.mock_model + self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) + vectors = self.model.vectorize(texts) + self.assertEqual(len(vectors), 2) + self.assertEqual(vectors[0], [4, 5, 6]) + self.assertEqual(vectors[1], [7, 8, 9]) + + def test_respond(self): + query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "indian_sbert__Model"}) + self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) + response = self.model.respond(query) + self.assertEqual(len(response), 1) + self.assertEqual(response[0].body.result, [1, 2, 3]) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From 1fab4ddb37688e54584203b0a1345edb638a6d9e Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 23 Aug 2024 10:58:31 -0700 Subject: [PATCH 2/2] fix file swap --- test/lib/model/test_indian_sbert.py | 8 ++++---- test/lib/model/test_paraphrase_multilingual.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/lib/model/test_indian_sbert.py b/test/lib/model/test_indian_sbert.py index 2a387097..7c21fa6c 100644 --- a/test/lib/model/test_indian_sbert.py +++ b/test/lib/model/test_indian_sbert.py @@ -7,13 +7,13 @@ from lib.model.generic_transformer import GenericTransformerModel from lib import schemas -class TestParaphraseMultilingual(unittest.TestCase): +class TestIndianSbert(unittest.TestCase): def setUp(self): self.model = GenericTransformerModel(None) self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "paraphrase_multilingual__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "paraphrase_multilingual__Model"})] + texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "indian_sbert__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "indian_sbert__Model"})] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,11 +22,11 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "paraphrase_multilingual__Model"}) + query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "indian_sbert__Model"}) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) self.assertEqual(response[0].body.result, [1, 2, 3]) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/lib/model/test_paraphrase_multilingual.py b/test/lib/model/test_paraphrase_multilingual.py index 0115dfe6..2a387097 100644 --- a/test/lib/model/test_paraphrase_multilingual.py +++ b/test/lib/model/test_paraphrase_multilingual.py @@ -7,13 +7,13 @@ from lib.model.generic_transformer import GenericTransformerModel from lib import schemas -class TestIndianSbert(unittest.TestCase): +class TestParaphraseMultilingual(unittest.TestCase): def setUp(self): self.model = GenericTransformerModel(None) self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "indian_sbert__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "indian_sbert__Model"})] + texts = [schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, "model_name": "paraphrase_multilingual__Model"}), schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, "model_name": "paraphrase_multilingual__Model"})] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "indian_sbert__Model"}) + query = schemas.parse_message({"body": {"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, "model_name": "paraphrase_multilingual__Model"}) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1)