From 8a32cdf4efb7d3cf8ef94c17543f5207eccf0425 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 3 Nov 2023 13:53:16 -0400 Subject: [PATCH 1/9] CV2-2875 tweak response structure --- lib/model/generic_transformer.py | 2 +- lib/schemas.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/model/generic_transformer.py b/lib/model/generic_transformer.py index 56ded7c..02d0b73 100644 --- a/lib/model/generic_transformer.py +++ b/lib/model/generic_transformer.py @@ -25,7 +25,7 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s vectorizable_texts = [e.body.text for e in docs] vectorized = self.vectorize(vectorizable_texts) for doc, vector in zip(docs, vectorized): - doc.response = vector + doc.hash_value = vector return docs def vectorize(self, texts: List[str]) -> List[List[float]]: diff --git a/lib/schemas.py b/lib/schemas.py index 26bdc03..ccccd8f 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -12,6 +12,7 @@ class TextOutput(BaseModel): id: str callback_url: str text: str + hash_value: HashValue class VideoInput(BaseModel): id: str From e7fec723af276f4a945852e37588b509e9bc90d6 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 3 Nov 2023 14:02:20 -0400 Subject: [PATCH 2/9] fix response body --- lib/model/fasttext.py | 2 +- lib/model/generic_transformer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index 52e1e3c..c3fa660 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -42,5 +42,5 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s detected_langs.append({'language': model_language, 'script': model_script, 'score': model_certainty}) for doc, detected_lang in zip(docs, detected_langs): - doc.response = detected_lang + doc.body.hash_value = detected_lang return docs diff --git a/lib/model/generic_transformer.py b/lib/model/generic_transformer.py index 02d0b73..a447150 100644 --- a/lib/model/generic_transformer.py +++ b/lib/model/generic_transformer.py @@ -25,7 +25,7 @@ def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[s vectorizable_texts = [e.body.text for e in docs] vectorized = self.vectorize(vectorizable_texts) for doc, vector in zip(docs, vectorized): - doc.hash_value = vector + doc.body.hash_value = vector return docs def vectorize(self, texts: List[str]) -> List[List[float]]: From 5712a199d46746f43657d38f60d19ddc70bb038f Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 3 Nov 2023 14:36:09 -0400 Subject: [PATCH 3/9] rework response structure to be more consistent across modalities --- lib/model/fasttext.py | 2 +- lib/model/generic_transformer.py | 2 +- lib/model/image.py | 2 +- lib/model/model.py | 2 +- lib/model/video.py | 2 +- lib/schemas.py | 62 +++++------------------------ test/lib/model/test_audio.py | 4 +- test/lib/model/test_fasttext.py | 4 +- test/lib/model/test_fptg.py | 6 +-- test/lib/model/test_generic.py | 6 +-- test/lib/model/test_image.py | 6 +-- test/lib/model/test_indian_sbert.py | 6 +-- test/lib/model/test_meantokens.py | 6 +-- test/lib/model/test_model.py | 2 +- test/lib/model/test_video.py | 6 +-- test/lib/queue/test_queue.py | 2 +- 16 files changed, 40 insertions(+), 80 deletions(-) diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index c3fa660..60b1d49 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -18,7 +18,7 @@ def __init__(self): self.model = fasttext.load_model(model_path) - def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.TextOutput]: + def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.MediaItem]: """ Force messages as list of messages in case we get a singular item. Then, run fingerprint routine. Respond can probably be genericized across all models. diff --git a/lib/model/generic_transformer.py b/lib/model/generic_transformer.py index a447150..2cdaf99 100644 --- a/lib/model/generic_transformer.py +++ b/lib/model/generic_transformer.py @@ -14,7 +14,7 @@ def __init__(self, model_name: str): if model_name: self.model = SentenceTransformer(model_name, cache_folder=os.getenv("MODEL_DIR", "./models")) - def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.TextOutput]: + def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.MediaItem]: """ Force messages as list of messages in case we get a singular item. Then, run fingerprint routine. Respond can probably be genericized across all models. diff --git a/lib/model/image.py b/lib/model/image.py index 41cdf4c..c528b95 100644 --- a/lib/model/image.py +++ b/lib/model/image.py @@ -30,7 +30,7 @@ def get_iobytes_for_image(self, image: schemas.Message) -> io.BytesIO: ).read() ) - def process(self, image: schemas.Message) -> schemas.ImageOutput: + def process(self, image: schemas.Message) -> schemas.MediaItem: """ Generic function for returning the actual response. """ diff --git a/lib/model/model.py b/lib/model/model.py index 0f9d938..094bca2 100644 --- a/lib/model/model.py +++ b/lib/model/model.py @@ -43,7 +43,7 @@ def respond(self, messages: Union[List[schemas.Message], schemas.Message]) -> Li if not isinstance(messages, list): messages = [messages] for message in messages: - message.response = self.process(message) + message.body.hash_value = self.process(message) return messages @classmethod diff --git a/lib/model/video.py b/lib/model/video.py index 8420041..180a5cb 100644 --- a/lib/model/video.py +++ b/lib/model/video.py @@ -41,7 +41,7 @@ def tmk_bucket(self) -> str: """ return "presto_tmk_videos" - def process(self, video: schemas.Message) -> schemas.VideoOutput: + def process(self, video: schemas.Message) -> schemas.VideoItem: """ Main fingerprinting routine - download video to disk, get short hash, then calculate larger TMK hash and upload that to S3. diff --git a/lib/schemas.py b/lib/schemas.py index ccccd8f..a5d20d4 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -3,59 +3,19 @@ # Output hash values can be of different types. HashValue = Union[List[float], str, int] -class TextInput(BaseModel): - id: str - callback_url: str - text: str - -class TextOutput(BaseModel): - id: str - callback_url: str - text: str - hash_value: HashValue - -class VideoInput(BaseModel): - id: str - callback_url: str - url: str - -class VideoOutput(BaseModel): - id: str - callback_url: str - url: str - bucket: str - outfile: str - hash_value: HashValue - -class AudioInput(BaseModel): - id: str - callback_url: str - url: str - -class AudioOutput(BaseModel): - id: str - callback_url: str - url: str - hash_value: HashValue - -class ImageInput(BaseModel): - id: str - callback_url: str - url: str - -class ImageOutput(BaseModel): - id: str - callback_url: str - url: str - hash_value: HashValue - -class GenericInput(BaseModel): - id: str - callback_url: str +class GenericItem(BaseModel): + id: Optional[str] = None + callback_url: Optional[str] = None url: Optional[str] = None text: Optional[str] = None raw: Optional[dict] = {} +class MediaItem(GenericItem): + hash_value: Optional[HashValue] = None + +class VideoItem(MediaItem): + bucket: Optional[str] = None + outfile: Optional[str] = None + class Message(BaseModel): - body: GenericInput - response: Any \ No newline at end of file + body: Union[GenericItem, MediaItem, VideoItem] \ No newline at end of file diff --git a/test/lib/model/test_audio.py b/test/lib/model/test_audio.py index c6359ba..c344826 100644 --- a/test/lib/model/test_audio.py +++ b/test/lib/model/test_audio.py @@ -24,7 +24,7 @@ def test_process_audio_success(self, mock_fingerprint_file, mock_request, mock_u mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.AudioInput(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) @@ -45,7 +45,7 @@ def test_process_audio_failure(self, mock_decode_fingerprint, mock_fingerprint_f mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.AudioInput(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index be160a2..e08a038 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -15,10 +15,10 @@ def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download): mock_fasttext_load_model.return_value = self.mock_model self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9])) model = FasttextModel() # Now it uses mocked functions - query = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] + query = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] response = model.respond(query) self.assertEqual(len(response), 1) - self.assertEqual(response[0].response, {'language': 'en', 'script': None, 'score': 0.9}) + self.assertEqual(response[0].body.hash_value, {'language': 'en', 'script': None, 'score': 0.9}) if __name__ == '__main__': unittest.main() diff --git a/test/lib/model/test_fptg.py b/test/lib/model/test_fptg.py index 5922ec5..2644322 100644 --- a/test/lib/model/test_fptg.py +++ b/test/lib/model/test_fptg.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,11 +22,11 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) - self.assertEqual(response[0].response, [1, 2, 3]) + self.assertEqual(response[0].body.hash_value, [1, 2, 3]) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_generic.py b/test/lib/model/test_generic.py index ccec97d..2455520 100644 --- a/test/lib/model/test_generic.py +++ b/test/lib/model/test_generic.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,11 +22,11 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) - self.assertEqual(response[0].response, [1, 2, 3]) + self.assertEqual(response[0].body.hash_value, [1, 2, 3]) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_image.py b/test/lib/model/test_image.py index 54ddb7e..266228c 100644 --- a/test/lib/model/test_image.py +++ b/test/lib/model/test_image.py @@ -25,7 +25,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): mock_response = Mock() mock_response.read.return_value = image_content mock_urlopen.return_value = mock_response - image = schemas.Message(body=schemas.ImageInput(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) result = Model().get_iobytes_for_image(image) self.assertIsInstance(result, io.BytesIO) self.assertEqual(result.read(), image_content) @@ -33,7 +33,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): @patch("urllib.request.urlopen") def test_get_iobytes_for_image_raises_error(self, mock_urlopen): mock_urlopen.side_effect = URLError('test error') - image = schemas.Message(body=schemas.ImageInput(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) with self.assertRaises(URLError): Model().get_iobytes_for_image(image) @@ -42,7 +42,7 @@ def test_get_iobytes_for_image_raises_error(self, mock_urlopen): def test_process(self, mock_compute_pdq, mock_get_iobytes_for_image): mock_compute_pdq.return_value = "1001" mock_get_iobytes_for_image.return_value = io.BytesIO(b"image_bytes") - image = schemas.Message(body=schemas.ImageInput(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) result = Model().process(image) self.assertEqual(result, {"hash_value": "1001"}) diff --git a/test/lib/model/test_indian_sbert.py b/test/lib/model/test_indian_sbert.py index 4284493..6a85d80 100644 --- a/test/lib/model/test_indian_sbert.py +++ b/test/lib/model/test_indian_sbert.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,11 +22,11 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="What is the capital of India?")) + query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="What is the capital of India?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) - self.assertEqual(response[0].response, [1, 2, 3]) + self.assertEqual(response[0].body.hash_value, [1, 2, 3]) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_meantokens.py b/test/lib/model/test_meantokens.py index cadf9a8..2ee9bea 100644 --- a/test/lib/model/test_meantokens.py +++ b/test/lib/model/test_meantokens.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,11 +22,11 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.TextInput(id="123", callback_url="http://example.com/callback", text="What is the capital of France?")) + query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="What is the capital of France?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) - self.assertEqual(response[0].response, [1, 2, 3]) + self.assertEqual(response[0].body.hash_value, [1, 2, 3]) if __name__ == '__main__': unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_model.py b/test/lib/model/test_model.py index 0ece10d..05be84d 100644 --- a/test/lib/model/test_model.py +++ b/test/lib/model/test_model.py @@ -6,7 +6,7 @@ # class TestModel(unittest.TestCase): # def test_respond(self): # model = Model() -# self.assertEqual(model.respond(schemas.Message(body=schemas.TextInput(id='123', callback_url="http://example.com/callback", text="hello"))), model.respond(schemas.Message(body=schemas.TextInput(id='123', callback_url="http://example.com/callback", text="hello"), response=[]))) +# self.assertEqual(model.respond(schemas.Message(body=schemas.MediaItem(id='123', callback_url="http://example.com/callback", text="hello"))), model.respond(schemas.Message(body=schemas.MediaItem(id='123', callback_url="http://example.com/callback", text="hello"), response=[]))) # # if __name__ == '__main__': # unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_video.py b/test/lib/model/test_video.py index 700438a..5af5bc8 100644 --- a/test/lib/model/test_video.py +++ b/test/lib/model/test_video.py @@ -32,7 +32,7 @@ def test_process_video(self, mock_pathlib, mock_upload_file_to_s3, mock_hash_video_output.getPureAverageFeature.return_value = "hash_value" mock_hash_video.return_value = mock_hash_video_output mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=video_contents)) - self.video_model.process(schemas.Message(body=schemas.VideoInput(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4"))) + self.video_model.process(schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4"))) mock_urlopen.assert_called_once() mock_hash_video.assert_called_once_with(ANY, "/usr/local/bin/ffmpeg") @@ -49,7 +49,7 @@ def test_tmk_program_name(self): self.assertEqual(result, "PrestoVideoEncoder") def test_respond_with_single_video(self): - video = schemas.Message(body=schemas.VideoInput(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4")) + video = schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4")) mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(video) @@ -57,7 +57,7 @@ def test_respond_with_single_video(self): self.assertEqual(result, [video]) def test_respond_with_multiple_videos(self): - videos = [schemas.Message(body=schemas.VideoInput(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video1.mp4")), schemas.Message(body=schemas.VideoInput(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video2.mp4"))] + videos = [schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video1.mp4")), schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video2.mp4"))] mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(videos) diff --git a/test/lib/queue/test_queue.py b/test/lib/queue/test_queue.py index ee0f536..c9d683e 100644 --- a/test/lib/queue/test_queue.py +++ b/test/lib/queue/test_queue.py @@ -104,7 +104,7 @@ def test_push_message(self): # Call push_message returned_message = self.queue.push_message(self.queue_name_output, message_to_push) # Check if the message was correctly serialized and sent - self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}}, "response": null}') + self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}}}') self.assertEqual(returned_message, message_to_push) if __name__ == '__main__': From 1caca5fd65021e5c37165fa84b7e9b5de32e9f55 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 3 Nov 2023 22:07:24 -0400 Subject: [PATCH 4/9] Shift schema due to issues with resolving generic -> specific on callback action --- lib/model/audio.py | 2 +- lib/model/fasttext.py | 2 +- lib/model/generic_transformer.py | 2 +- lib/model/image.py | 4 ++-- lib/model/video.py | 2 +- lib/queue/processor.py | 2 +- lib/schemas.py | 9 ++------- test/lib/model/test_audio.py | 4 ++-- test/lib/model/test_fasttext.py | 2 +- test/lib/model/test_fptg.py | 4 ++-- test/lib/model/test_generic.py | 4 ++-- test/lib/model/test_image.py | 6 +++--- test/lib/model/test_indian_sbert.py | 4 ++-- test/lib/model/test_meantokens.py | 4 ++-- test/lib/model/test_model.py | 2 +- test/lib/model/test_video.py | 6 +++--- 16 files changed, 27 insertions(+), 32 deletions(-) diff --git a/lib/model/audio.py b/lib/model/audio.py index fea03cb..613dedc 100644 --- a/lib/model/audio.py +++ b/lib/model/audio.py @@ -26,4 +26,4 @@ def process(self, audio: schemas.Message) -> Dict[str, Union[str, List[int]]]: hash_value = self.audio_hasher(temp_file_name) finally: os.remove(temp_file_name) - return {"hash_value": hash_value} + return hash_value diff --git a/lib/model/fasttext.py b/lib/model/fasttext.py index 60b1d49..5081c5a 100644 --- a/lib/model/fasttext.py +++ b/lib/model/fasttext.py @@ -18,7 +18,7 @@ def __init__(self): self.model = fasttext.load_model(model_path) - def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.MediaItem]: + def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.GenericItem]: """ Force messages as list of messages in case we get a singular item. Then, run fingerprint routine. Respond can probably be genericized across all models. diff --git a/lib/model/generic_transformer.py b/lib/model/generic_transformer.py index 2cdaf99..46e6089 100644 --- a/lib/model/generic_transformer.py +++ b/lib/model/generic_transformer.py @@ -14,7 +14,7 @@ def __init__(self, model_name: str): if model_name: self.model = SentenceTransformer(model_name, cache_folder=os.getenv("MODEL_DIR", "./models")) - def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.MediaItem]: + def respond(self, docs: Union[List[schemas.Message], schemas.Message]) -> List[schemas.GenericItem]: """ Force messages as list of messages in case we get a singular item. Then, run fingerprint routine. Respond can probably be genericized across all models. diff --git a/lib/model/image.py b/lib/model/image.py index c528b95..7174e8f 100644 --- a/lib/model/image.py +++ b/lib/model/image.py @@ -30,8 +30,8 @@ def get_iobytes_for_image(self, image: schemas.Message) -> io.BytesIO: ).read() ) - def process(self, image: schemas.Message) -> schemas.MediaItem: + def process(self, image: schemas.Message) -> schemas.GenericItem: """ Generic function for returning the actual response. """ - return {"hash_value": self.compute_pdq(self.get_iobytes_for_image(image))} + return self.compute_pdq(self.get_iobytes_for_image(image)) diff --git a/lib/model/video.py b/lib/model/video.py index 180a5cb..569d8b8 100644 --- a/lib/model/video.py +++ b/lib/model/video.py @@ -41,7 +41,7 @@ def tmk_bucket(self) -> str: """ return "presto_tmk_videos" - def process(self, video: schemas.Message) -> schemas.VideoItem: + def process(self, video: schemas.Message) -> schemas.GenericItem: """ Main fingerprinting routine - download video to disk, get short hash, then calculate larger TMK hash and upload that to S3. diff --git a/lib/queue/processor.py b/lib/queue/processor.py index 1567070..a7a5499 100644 --- a/lib/queue/processor.py +++ b/lib/queue/processor.py @@ -37,7 +37,7 @@ def send_callbacks(self) -> List[schemas.Message]: """ messages_with_queues = self.receive_messages(self.batch_size) if messages_with_queues: - logger.debug(f"About to respond to: ({messages_with_queues})") + logger.info(f"About to respond to: ({messages_with_queues})") bodies = [schemas.Message(**json.loads(message.body)) for message, queue in messages_with_queues] for body in bodies: self.send_callback(body) diff --git a/lib/schemas.py b/lib/schemas.py index a5d20d4..991aca5 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -2,20 +2,15 @@ from pydantic import BaseModel # Output hash values can be of different types. -HashValue = Union[List[float], str, int] class GenericItem(BaseModel): id: Optional[str] = None callback_url: Optional[str] = None url: Optional[str] = None text: Optional[str] = None raw: Optional[dict] = {} - -class MediaItem(GenericItem): - hash_value: Optional[HashValue] = None - -class VideoItem(MediaItem): + hash_value: Optional[Any] = None bucket: Optional[str] = None outfile: Optional[str] = None class Message(BaseModel): - body: Union[GenericItem, MediaItem, VideoItem] \ No newline at end of file + body: Union[GenericItem] \ No newline at end of file diff --git a/test/lib/model/test_audio.py b/test/lib/model/test_audio.py index c344826..306045d 100644 --- a/test/lib/model/test_audio.py +++ b/test/lib/model/test_audio.py @@ -24,7 +24,7 @@ def test_process_audio_success(self, mock_fingerprint_file, mock_request, mock_u mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) @@ -45,7 +45,7 @@ def test_process_audio_failure(self, mock_decode_fingerprint, mock_fingerprint_f mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index e08a038..1eb43b0 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -15,7 +15,7 @@ def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download): mock_fasttext_load_model.return_value = self.mock_model self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9])) model = FasttextModel() # Now it uses mocked functions - query = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] + query = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] response = model.respond(query) self.assertEqual(len(response), 1) self.assertEqual(response[0].body.hash_value, {'language': 'en', 'script': None, 'score': 0.9}) diff --git a/test/lib/model/test_fptg.py b/test/lib/model/test_fptg.py index 2644322..a6789ba 100644 --- a/test/lib/model/test_fptg.py +++ b/test/lib/model/test_fptg.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_generic.py b/test/lib/model/test_generic.py index 2455520..c6508b2 100644 --- a/test/lib/model/test_generic.py +++ b/test/lib/model/test_generic.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_image.py b/test/lib/model/test_image.py index 266228c..d8568d2 100644 --- a/test/lib/model/test_image.py +++ b/test/lib/model/test_image.py @@ -25,7 +25,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): mock_response = Mock() mock_response.read.return_value = image_content mock_urlopen.return_value = mock_response - image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) result = Model().get_iobytes_for_image(image) self.assertIsInstance(result, io.BytesIO) self.assertEqual(result.read(), image_content) @@ -33,7 +33,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): @patch("urllib.request.urlopen") def test_get_iobytes_for_image_raises_error(self, mock_urlopen): mock_urlopen.side_effect = URLError('test error') - image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) with self.assertRaises(URLError): Model().get_iobytes_for_image(image) @@ -42,7 +42,7 @@ def test_get_iobytes_for_image_raises_error(self, mock_urlopen): def test_process(self, mock_compute_pdq, mock_get_iobytes_for_image): mock_compute_pdq.return_value = "1001" mock_get_iobytes_for_image.return_value = io.BytesIO(b"image_bytes") - image = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) result = Model().process(image) self.assertEqual(result, {"hash_value": "1001"}) diff --git a/test/lib/model/test_indian_sbert.py b/test/lib/model/test_indian_sbert.py index 6a85d80..d4bfcb8 100644 --- a/test/lib/model/test_indian_sbert.py +++ b/test/lib/model/test_indian_sbert.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="What is the capital of India?")) + query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="What is the capital of India?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_meantokens.py b/test/lib/model/test_meantokens.py index 2ee9bea..f11791a 100644 --- a/test/lib/model/test_meantokens.py +++ b/test/lib/model/test_meantokens.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.MediaItem(id="123", callback_url="http://example.com/callback", text="What is the capital of France?")) + query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="What is the capital of France?")) self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_model.py b/test/lib/model/test_model.py index 05be84d..a65b9d7 100644 --- a/test/lib/model/test_model.py +++ b/test/lib/model/test_model.py @@ -6,7 +6,7 @@ # class TestModel(unittest.TestCase): # def test_respond(self): # model = Model() -# self.assertEqual(model.respond(schemas.Message(body=schemas.MediaItem(id='123', callback_url="http://example.com/callback", text="hello"))), model.respond(schemas.Message(body=schemas.MediaItem(id='123', callback_url="http://example.com/callback", text="hello"), response=[]))) +# self.assertEqual(model.respond(schemas.Message(body=schemas.GenericItem(id='123', callback_url="http://example.com/callback", text="hello"))), model.respond(schemas.Message(body=schemas.GenericItem(id='123', callback_url="http://example.com/callback", text="hello"), response=[]))) # # if __name__ == '__main__': # unittest.main() \ No newline at end of file diff --git a/test/lib/model/test_video.py b/test/lib/model/test_video.py index 5af5bc8..0bc4105 100644 --- a/test/lib/model/test_video.py +++ b/test/lib/model/test_video.py @@ -32,7 +32,7 @@ def test_process_video(self, mock_pathlib, mock_upload_file_to_s3, mock_hash_video_output.getPureAverageFeature.return_value = "hash_value" mock_hash_video.return_value = mock_hash_video_output mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=video_contents)) - self.video_model.process(schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4"))) + self.video_model.process(schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4"))) mock_urlopen.assert_called_once() mock_hash_video.assert_called_once_with(ANY, "/usr/local/bin/ffmpeg") @@ -49,7 +49,7 @@ def test_tmk_program_name(self): self.assertEqual(result, "PrestoVideoEncoder") def test_respond_with_single_video(self): - video = schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4")) + video = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4")) mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(video) @@ -57,7 +57,7 @@ def test_respond_with_single_video(self): self.assertEqual(result, [video]) def test_respond_with_multiple_videos(self): - videos = [schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video1.mp4")), schemas.Message(body=schemas.VideoItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video2.mp4"))] + videos = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video1.mp4")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video2.mp4"))] mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(videos) From bc9af913649ef44b527203bd406e8dc0965ad185 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Fri, 3 Nov 2023 22:20:35 -0400 Subject: [PATCH 5/9] fix tests --- test/lib/model/test_audio.py | 4 ++-- test/lib/model/test_image.py | 2 +- test/lib/queue/test_queue.py | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/lib/model/test_audio.py b/test/lib/model/test_audio.py index 306045d..2c815c6 100644 --- a/test/lib/model/test_audio.py +++ b/test/lib/model/test_audio.py @@ -28,7 +28,7 @@ def test_process_audio_success(self, mock_fingerprint_file, mock_request, mock_u result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) - self.assertEqual(list, type(result["hash_value"])) + self.assertEqual(list, type(result)) @patch('urllib.request.urlopen') @patch('urllib.request.Request') @@ -49,7 +49,7 @@ def test_process_audio_failure(self, mock_decode_fingerprint, mock_fingerprint_f result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) - self.assertEqual([], result["hash_value"]) + self.assertEqual([], result) if __name__ == '__main__': unittest.main() diff --git a/test/lib/model/test_image.py b/test/lib/model/test_image.py index d8568d2..3af83dc 100644 --- a/test/lib/model/test_image.py +++ b/test/lib/model/test_image.py @@ -44,7 +44,7 @@ def test_process(self, mock_compute_pdq, mock_get_iobytes_for_image): mock_get_iobytes_for_image.return_value = io.BytesIO(b"image_bytes") image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) result = Model().process(image) - self.assertEqual(result, {"hash_value": "1001"}) + self.assertEqual(result, "1001") if __name__ == "__main__": diff --git a/test/lib/queue/test_queue.py b/test/lib/queue/test_queue.py index c9d683e..79f4903 100644 --- a/test/lib/queue/test_queue.py +++ b/test/lib/queue/test_queue.py @@ -104,7 +104,8 @@ def test_push_message(self): # Call push_message returned_message = self.queue.push_message(self.queue_name_output, message_to_push) # Check if the message was correctly serialized and sent - self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}}}') + self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}, "hash_value": null, "bucket": null, "outfile": null}}') + self.assertEqual(returned_message, message_to_push) if __name__ == '__main__': From 8062f641d5be328b261db29df22a5e608005b766 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 8 Nov 2023 05:55:48 -0800 Subject: [PATCH 6/9] update schema and message passing to be explicit about type --- README.md | 2 +- lib/model/model.py | 3 +++ lib/queue/worker.py | 2 +- lib/schemas.py | 18 ++++++++++++++++-- 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fb9cb9a..0499564 100644 --- a/README.md +++ b/README.md @@ -206,4 +206,4 @@ If no callback_url is provided: { "message": "No Message Callback, Passing" } -``` +``` \ No newline at end of file diff --git a/lib/model/model.py b/lib/model/model.py index 094bca2..eb99df7 100644 --- a/lib/model/model.py +++ b/lib/model/model.py @@ -9,6 +9,9 @@ from lib import schemas class Model(ABC): BATCH_SIZE = 1 + def __init__(self): + self.model_name = os.environ.get("MODEL_NAME") + def get_tempfile_for_url(self, url: str) -> str: """ Loads a file based on specified URL into a named tempfile. diff --git a/lib/queue/worker.py b/lib/queue/worker.py index 7e7a4e1..3b0f2df 100644 --- a/lib/queue/worker.py +++ b/lib/queue/worker.py @@ -52,7 +52,7 @@ def safely_respond(self, model: Model) -> List[schemas.Message]: if messages_with_queues: logger.debug(f"About to respond to: ({messages_with_queues})") try: - responses = model.respond([schemas.Message(**json.loads(message.body)) for message, queue in messages_with_queues]) + responses = model.respond([schemas.Message(**{**json.loads(message.body), **{"model_name": self.model.model_name}}) for message, queue in messages_with_queues]) except Exception as e: logger.error(e) self.delete_messages(messages_with_queues) diff --git a/lib/schemas.py b/lib/schemas.py index 991aca5..985e594 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -1,5 +1,5 @@ from typing import Any, List, Optional, Union -from pydantic import BaseModel +from pydantic import BaseModel, root_validator # Output hash values can be of different types. class GenericItem(BaseModel): @@ -8,9 +8,23 @@ class GenericItem(BaseModel): url: Optional[str] = None text: Optional[str] = None raw: Optional[dict] = {} + +class MediaItem(BaseModel): hash_value: Optional[Any] = None + +class VideoItem(MediaItem): bucket: Optional[str] = None outfile: Optional[str] = None class Message(BaseModel): - body: Union[GenericItem] \ No newline at end of file + body: Union[MediaItem, VideoItem] + model_name: str + @root_validator(pre=True) + def set_body(cls, values): + body = values.get("body") + model_name = values.get("model_name") + if model_name == "video__Model": + values["body"] = VideoItem(**values["body"]) + if model_name in ["audio__Model", "image__Model", "fptg__Model", "indian_sbert__Model", "mean_tokens__Model", "fasttext__Model"] + values["body"] = MediaItem(**values["body"]) + return values From 7444044592f732c65d20b10b2d5ebdb57119eaba Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 8 Nov 2023 06:00:02 -0800 Subject: [PATCH 7/9] fix typo --- lib/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/schemas.py b/lib/schemas.py index 985e594..ecd8a67 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -25,6 +25,6 @@ def set_body(cls, values): model_name = values.get("model_name") if model_name == "video__Model": values["body"] = VideoItem(**values["body"]) - if model_name in ["audio__Model", "image__Model", "fptg__Model", "indian_sbert__Model", "mean_tokens__Model", "fasttext__Model"] + if model_name in ["audio__Model", "image__Model", "fptg__Model", "indian_sbert__Model", "mean_tokens__Model", "fasttext__Model"]: values["body"] = MediaItem(**values["body"]) return values From c02449a7d7989b8f90f183723de23b8193e60e08 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 8 Nov 2023 10:43:56 -0800 Subject: [PATCH 8/9] redo test fixtures and readd specific schemas --- lib/http.py | 2 +- lib/schemas.py | 8 ++++---- test/lib/model/test_audio.py | 4 ++-- test/lib/model/test_fasttext.py | 2 +- test/lib/model/test_fptg.py | 4 ++-- test/lib/model/test_generic.py | 4 ++-- test/lib/model/test_image.py | 6 +++--- test/lib/model/test_indian_sbert.py | 4 ++-- test/lib/model/test_meantokens.py | 4 ++-- test/lib/model/test_video.py | 6 +++--- test/lib/queue/test_processor.py | 6 +++--- test/lib/queue/test_queue.py | 4 ++-- test/lib/test_http.py | 6 +++--- 13 files changed, 30 insertions(+), 30 deletions(-) diff --git a/lib/http.py b/lib/http.py index 3544333..bfb0818 100644 --- a/lib/http.py +++ b/lib/http.py @@ -35,7 +35,7 @@ def process_item(process_name: str, message: Dict[str, Any]): logger.info(message) queue_prefix = Queue.get_queue_prefix() queue = QueueWorker.create(process_name) - queue.push_message(f"{queue_prefix}{process_name}", schemas.Message(body=message)) + queue.push_message(f"{queue_prefix}{process_name}", schemas.Message(body=message, model_name=process_name)) return {"message": "Message pushed successfully", "queue": process_name, "body": message} @app.post("/trigger_callback") diff --git a/lib/schemas.py b/lib/schemas.py index ecd8a67..6679c20 100644 --- a/lib/schemas.py +++ b/lib/schemas.py @@ -3,13 +3,13 @@ # Output hash values can be of different types. class GenericItem(BaseModel): - id: Optional[str] = None + id: str callback_url: Optional[str] = None url: Optional[str] = None text: Optional[str] = None raw: Optional[dict] = {} -class MediaItem(BaseModel): +class MediaItem(GenericItem): hash_value: Optional[Any] = None class VideoItem(MediaItem): @@ -24,7 +24,7 @@ def set_body(cls, values): body = values.get("body") model_name = values.get("model_name") if model_name == "video__Model": - values["body"] = VideoItem(**values["body"]) + values["body"] = VideoItem(**values["body"]).dict() if model_name in ["audio__Model", "image__Model", "fptg__Model", "indian_sbert__Model", "mean_tokens__Model", "fasttext__Model"]: - values["body"] = MediaItem(**values["body"]) + values["body"] = MediaItem(**values["body"]).dict() return values diff --git a/test/lib/model/test_audio.py b/test/lib/model/test_audio.py index 2c815c6..48c7bb5 100644 --- a/test/lib/model/test_audio.py +++ b/test/lib/model/test_audio.py @@ -24,7 +24,7 @@ def test_process_audio_success(self, mock_fingerprint_file, mock_request, mock_u mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "url": "https://example.com/audio.mp3"}, model_name="audio__Model") result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) @@ -45,7 +45,7 @@ def test_process_audio_failure(self, mock_decode_fingerprint, mock_fingerprint_f mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=contents)) - audio = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", url="https://example.com/audio.mp3")) + audio = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "url": "https://example.com/audio.mp3"}, model_name="audio__Model") result = self.audio_model.process(audio) mock_request.assert_called_once_with(audio.body.url, headers={'User-Agent': 'Mozilla/5.0'}) mock_urlopen.assert_called_once_with(mock_request) diff --git a/test/lib/model/test_fasttext.py b/test/lib/model/test_fasttext.py index 1eb43b0..07770da 100644 --- a/test/lib/model/test_fasttext.py +++ b/test/lib/model/test_fasttext.py @@ -15,7 +15,7 @@ def test_respond(self, mock_fasttext_load_model, mock_hf_hub_download): mock_fasttext_load_model.return_value = self.mock_model self.mock_model.predict.return_value = (['__label__eng_Latn'], np.array([0.9])) model = FasttextModel() # Now it uses mocked functions - query = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?"))] + query = [schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, model_name="fasttext__Model")] response = model.respond(query) self.assertEqual(len(response), 1) self.assertEqual(response[0].body.hash_value, {'language': 'en', 'script': None, 'score': 0.9}) diff --git a/test/lib/model/test_fptg.py b/test/lib/model/test_fptg.py index a6789ba..f878505 100644 --- a/test/lib/model/test_fptg.py +++ b/test/lib/model/test_fptg.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, model_name="fptg__Model"), schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, model_name="fptg__Model")] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Anong pangalan mo?"}, model_name="fptg__Model") self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_generic.py b/test/lib/model/test_generic.py index c6508b2..1e168e0 100644 --- a/test/lib/model/test_generic.py +++ b/test/lib/model/test_generic.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, model_name="fptg__Model"), schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, model_name="fptg__Model")] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Anong pangalan mo?")) + query = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Anong pangalan mo?"}, model_name="fptg__Model") self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_image.py b/test/lib/model/test_image.py index 3af83dc..3e31af0 100644 --- a/test/lib/model/test_image.py +++ b/test/lib/model/test_image.py @@ -25,7 +25,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): mock_response = Mock() mock_response.read.return_value = image_content mock_urlopen.return_value = mock_response - image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body={"id": "123", "callback_url": "http://example.com?callback", "url": "http://example.com/image.jpg"}, model_name="audio__Model") result = Model().get_iobytes_for_image(image) self.assertIsInstance(result, io.BytesIO) self.assertEqual(result.read(), image_content) @@ -33,7 +33,7 @@ def test_get_iobytes_for_image(self, mock_urlopen): @patch("urllib.request.urlopen") def test_get_iobytes_for_image_raises_error(self, mock_urlopen): mock_urlopen.side_effect = URLError('test error') - image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body={"id": "123", "callback_url": "http://example.com?callback", "url": "http://example.com/image.jpg"}, model_name="audio__Model") with self.assertRaises(URLError): Model().get_iobytes_for_image(image) @@ -42,7 +42,7 @@ def test_get_iobytes_for_image_raises_error(self, mock_urlopen): def test_process(self, mock_compute_pdq, mock_get_iobytes_for_image): mock_compute_pdq.return_value = "1001" mock_get_iobytes_for_image.return_value = io.BytesIO(b"image_bytes") - image = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com?callback", url="http://example.com/image.jpg")) + image = schemas.Message(body={"id": "123", "callback_url": "http://example.com?callback", "url": "http://example.com/image.jpg"}, model_name="audio__Model") result = Model().process(image) self.assertEqual(result, "1001") diff --git a/test/lib/model/test_indian_sbert.py b/test/lib/model/test_indian_sbert.py index d4bfcb8..5104a1e 100644 --- a/test/lib/model/test_indian_sbert.py +++ b/test/lib/model/test_indian_sbert.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, model_name="indian_sbert__Model"), schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, model_name="indian_sbert__Model")] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="What is the capital of India?")) + query = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of India?"}, model_name="indian_sbert__Model") self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_meantokens.py b/test/lib/model/test_meantokens.py index f11791a..bbfa3e2 100644 --- a/test/lib/model/test_meantokens.py +++ b/test/lib/model/test_meantokens.py @@ -13,7 +13,7 @@ def setUp(self): self.mock_model = MagicMock() def test_vectorize(self): - texts = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="Hello, how are you?")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="I'm doing great, thanks!"))] + texts = [schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "Hello, how are you?"}, model_name="mean_tokens__Model"), schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "I'm doing great, thanks!"}, model_name="mean_tokens__Model")] self.model.model = self.mock_model self.model.model.encode = MagicMock(return_value=np.array([[4, 5, 6], [7, 8, 9]])) vectors = self.model.vectorize(texts) @@ -22,7 +22,7 @@ def test_vectorize(self): self.assertEqual(vectors[1], [7, 8, 9]) def test_respond(self): - query = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://example.com/callback", text="What is the capital of France?")) + query = schemas.Message(body={"id": "123", "callback_url": "http://example.com/callback", "text": "What is the capital of France?"}, model_name="mean_tokens__Model") self.model.vectorize = MagicMock(return_value=[[1, 2, 3]]) response = self.model.respond(query) self.assertEqual(len(response), 1) diff --git a/test/lib/model/test_video.py b/test/lib/model/test_video.py index 0bc4105..002a710 100644 --- a/test/lib/model/test_video.py +++ b/test/lib/model/test_video.py @@ -32,7 +32,7 @@ def test_process_video(self, mock_pathlib, mock_upload_file_to_s3, mock_hash_video_output.getPureAverageFeature.return_value = "hash_value" mock_hash_video.return_value = mock_hash_video_output mock_urlopen.return_value = MagicMock(read=MagicMock(return_value=video_contents)) - self.video_model.process(schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4"))) + self.video_model.process(schemas.Message(body={"id": "123", "callback_url": "http://blah.com?callback_id=123", "url": "http://example.com/video.mp4"}, model_name="video__Model")) mock_urlopen.assert_called_once() mock_hash_video.assert_called_once_with(ANY, "/usr/local/bin/ffmpeg") @@ -49,7 +49,7 @@ def test_tmk_program_name(self): self.assertEqual(result, "PrestoVideoEncoder") def test_respond_with_single_video(self): - video = schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video.mp4")) + video = schemas.Message(body={"id": "123", "callback_url": "http://blah.com?callback_id=123", "url": "http://example.com/video.mp4"}, model_name="video__Model") mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(video) @@ -57,7 +57,7 @@ def test_respond_with_single_video(self): self.assertEqual(result, [video]) def test_respond_with_multiple_videos(self): - videos = [schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video1.mp4")), schemas.Message(body=schemas.GenericItem(id="123", callback_url="http://blah.com?callback_id=123", url="http://example.com/video2.mp4"))] + videos = [schemas.Message(body={"id": "123", "callback_url": "http://blah.com?callback_id=123", "url": "http://example.com/video.mp4"}, model_name="video__Model"), schemas.Message(body={"id": "123", "callback_url": "http://blah.com?callback_id=123", "url": "http://example.com/video2.mp4"}, model_name="video__Model")] mock_process = MagicMock() self.video_model.process = mock_process result = self.video_model.respond(videos) diff --git a/test/lib/queue/test_processor.py b/test/lib/queue/test_processor.py index 85dc7ac..f258818 100644 --- a/test/lib/queue/test_processor.py +++ b/test/lib/queue/test_processor.py @@ -25,7 +25,7 @@ def setUp(self, mock_get_env_setting, mock_boto_resource): def test_send_callbacks(self): # Mocking necessary methods and creating fake data self.queue_processor.receive_messages = MagicMock( - return_value=[(FakeSQSMessage(receipt_handle="blah", body=json.dumps({"body": {"callback_url": "http://example.com", "text": "This is a test", "id": 1}, "response": [1,2,3]})), self.mock_input_queue)] + return_value=[(FakeSQSMessage(receipt_handle="blah", body=json.dumps({"body": {"callback_url": "http://example.com", "text": "This is a test", "id": 1, "hash_value": [1,2,3]}, "model_name": "mean_tokens__Model"})), self.mock_input_queue)] ) self.queue_processor.send_callback = MagicMock(return_value=None) self.queue_processor.delete_messages = MagicMock(return_value=None) @@ -38,7 +38,7 @@ def test_send_callbacks(self): @patch('lib.queue.processor.requests.post') def test_send_callback(self, mock_post): - message_body = schemas.Message(body={"callback_url": "http://example.com", "text": "This is a test", "id": 123}, response=[1,2,3]) + message_body = schemas.Message(body={"callback_url": "http://example.com", "text": "This is a test", "id": 123, "hash_value": [1,2,3]}, model_name="mean_tokens__Model") self.queue_processor.send_callback(message_body) mock_post.assert_called_once_with("http://example.com", json=message_body) @@ -46,7 +46,7 @@ def test_send_callback(self, mock_post): @patch('lib.queue.processor.requests.post') def test_send_callback_failure(self, mock_post): mock_post.side_effect = Exception("Request Failed!") - message_body = schemas.Message(body={"callback_url": "http://example.com", "text": "This is a test", "id": 123}, response=[1,2,3]) + message_body = schemas.Message(body={"callback_url": "http://example.com", "text": "This is a test", "id": 123, "hash_value": [1,2,3]}, model_name="mean_tokens__Model") with self.assertLogs(level='ERROR') as cm: self.queue_processor.send_callback(message_body) self.assertIn("Failed with Request Failed! on http://example.com with message of", cm.output[0]) diff --git a/test/lib/queue/test_queue.py b/test/lib/queue/test_queue.py index 79f4903..a0e38c5 100644 --- a/test/lib/queue/test_queue.py +++ b/test/lib/queue/test_queue.py @@ -100,11 +100,11 @@ def test_delete_messages_from_queue(self, mock_logger): mock_logger.assert_called_with(f"Deleting message: {mock_messages[-1]}") def test_push_message(self): - message_to_push = schemas.Message(body={"id": 1, "callback_url": "http://example.com", "text": "This is a test"}) + message_to_push = schemas.Message(body={"id": 1, "callback_url": "http://example.com", "text": "This is a test"}, model_name="mean_tokens__Model") # Call push_message returned_message = self.queue.push_message(self.queue_name_output, message_to_push) # Check if the message was correctly serialized and sent - self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}, "hash_value": null, "bucket": null, "outfile": null}}') + self.mock_output_queue.send_message.assert_called_once_with(MessageBody='{"body": {"id": "1", "callback_url": "http://example.com", "url": null, "text": "This is a test", "raw": {}, "hash_value": null}, "model_name": "mean_tokens__Model"}') self.assertEqual(returned_message, message_to_push) diff --git a/test/lib/test_http.py b/test/lib/test_http.py index 773d61f..c9d45d9 100644 --- a/test/lib/test_http.py +++ b/test/lib/test_http.py @@ -17,10 +17,10 @@ def test_process_item(self, mock_push_message, mock_create): test_data = {"id": 1, "callback_url": "http://example.com", "text": "This is a test"} - response = self.client.post("/process_item/test_process", json=test_data) - mock_create.assert_called_once_with("test_process") + response = self.client.post("/process_item/fptg__Model", json=test_data) + mock_create.assert_called_once_with("fptg__Model") self.assertEqual(response.status_code, 200) - self.assertEqual(response.json(), {"message": "Message pushed successfully", "queue": "test_process", "body": test_data}) + self.assertEqual(response.json(), {"message": "Message pushed successfully", "queue": "fptg__Model", "body": test_data}) @patch('lib.http.post_url') From 6165fdcc601abcfa0625772dc0c41f45e87bb757 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Wed, 8 Nov 2023 13:32:02 -0800 Subject: [PATCH 9/9] fix model reference --- lib/queue/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/queue/worker.py b/lib/queue/worker.py index 3b0f2df..b45bbcb 100644 --- a/lib/queue/worker.py +++ b/lib/queue/worker.py @@ -52,7 +52,7 @@ def safely_respond(self, model: Model) -> List[schemas.Message]: if messages_with_queues: logger.debug(f"About to respond to: ({messages_with_queues})") try: - responses = model.respond([schemas.Message(**{**json.loads(message.body), **{"model_name": self.model.model_name}}) for message, queue in messages_with_queues]) + responses = model.respond([schemas.Message(**{**json.loads(message.body), **{"model_name": model.model_name}}) for message, queue in messages_with_queues]) except Exception as e: logger.error(e) self.delete_messages(messages_with_queues)