Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cv2 3833 compute and save sscd embeddings for images #47

Open
wants to merge 28 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
cae6e6e
adding `self,` to `def compute_pdq(self, iobytes: io.BytesIO) -> str:`
ahmednasserswe Oct 30, 2023
2dad0b6
create `image_sscd.py` and corresponding changes in Dockerfile
ahmednasserswe Oct 30, 2023
3c3fc9f
removing `--extra-index-url https://download.pytorch.org/whl/cu113` f…
ahmednasserswe Nov 8, 2023
8224cfb
installing sscd requirements directly in Dockerfile
ahmednasserswe Nov 8, 2023
b22b803
changing `pytorch-lightning` version to `1.5.10`
ahmednasserswe Nov 8, 2023
1797637
trying to build with sscd installation commented in dockerfile
ahmednasserswe Nov 8, 2023
295e0b8
Changing `Model.compute_pdq(io.BytesIO(image_content))` to `result = …
ahmednasserswe Nov 9, 2023
40e5bcf
uncommenting lines to install sscd-copy-detection in Dockerfile
ahmednasserswe Nov 9, 2023
bcc77e3
installing sscd requirements directly in dockerfile instead from requ…
ahmednasserswe Nov 10, 2023
c4bb746
Comment git clone line - Update Dockerfile
computermacgyver Nov 10, 2023
1350858
Move packages to requirements.txt, remove possibly unneeded ones
Nov 10, 2023
4049828
add back model download
Nov 10, 2023
fef66c4
Remove possibly unused requirements
Nov 11, 2023
11fd99c
Merge branch 'master' into CV2-3833-Compute-and-save-SSCD-embeddings-…
Nov 11, 2023
5372d57
Large refactor
Nov 11, 2023
4713a35
Adding missing files
Nov 11, 2023
1e9227b
Move SSCD model download to __init__
Nov 11, 2023
d702e40
fix typo
Nov 11, 2023
3449286
Revert comments to lib/queue/worker.py
Nov 11, 2023
690d606
Revert comments to lib/queue/worker.py
Nov 11, 2023
2a4e158
update import in test
Nov 14, 2023
a799f21
changing `from lib.model.image import Model` to `from lib.model.image…
ahmednasserswe Nov 20, 2023
998d05f
adding `test_image_sscd.py` and `img/presto_flowchart.jpg`
ahmednasserswe Nov 20, 2023
eb5f6f5
Merge branch 'CV2-3833-Compute-and-save-SSCD-embeddings-for-images' o…
ahmednasserswe Nov 20, 2023
d3dc05f
Use numpy.allclose to accommodate OS/chipset differences
Nov 20, 2023
cd191d8
fix test
Nov 29, 2023
5697bee
Update image_sscd.py with comments describing of normalization and re…
ahmednasserswe Nov 29, 2023
c433a0e
drop file
DGaffney Dec 18, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env_file
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,7 @@ PRESTO_PORT=8000
DEPLOY_ENV=local
# MODEL_NAME=mean_tokens.Model
MODEL_NAME=audio.Model
# MODEL_NAME=image_sscd.Model
# MODEL_NAME=image_pdq.Model
AWS_ACCESS_KEY_ID=SOMETHING
AWS_SECRET_ACCESS_KEY=OTHERTHING
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*.cpython-39.pyc
*.pyc
sscd_disc_mixup.torchscript.pt
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,6 @@ RUN pip install transformers
RUN pip install pact-python
RUN pip install --no-cache-dir -r requirements.txt
RUN cd threatexchange/pdq/python && pip install .

COPY . .
CMD ["make", "run"]
CMD ["make", "run"]
27 changes: 27 additions & 0 deletions lib/model/generic_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from lib.model.model import Model

from lib import schemas
import urllib.request
import io

class GenericImageModel(Model):
    """Base class for image models that fetch an image by URL and fingerprint it.

    ``process`` calls ``self.compute_imagehash``, which is not defined here;
    concrete image models are expected to provide it.
    """

    def get_iobytes_for_image(self, image: schemas.Message) -> io.BytesIO:
        """
        Read file as bytes after requesting based on URL.

        :param image: message whose ``body.url`` is the address to download.
        :returns: in-memory buffer holding the complete response body.
        """
        request = urllib.request.Request(
            image.body.url,
            # NOTE(review): presumably a browser-like agent is sent because some
            # hosts reject urllib's default User-Agent -- confirm.
            headers={'User-Agent': 'Mozilla/5.0'}
        )
        # Use the response as a context manager so the underlying connection
        # is closed as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            return io.BytesIO(response.read())

    def process(self, image: schemas.Message) -> schemas.GenericItem:
        """
        Generic function for returning the actual response.

        Downloads the image referenced by ``image`` and returns whatever
        ``compute_imagehash`` produces for its bytes.
        """
        return self.compute_imagehash(self.get_iobytes_for_image(image))
37 changes: 0 additions & 37 deletions lib/model/image.py

This file was deleted.

20 changes: 20 additions & 0 deletions lib/model/image_pdq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from typing import Dict
import io

from lib.model.generic_image import GenericImageModel

from pdqhashing.hasher.pdq_hasher import PDQHasher
from lib import schemas

class Model(GenericImageModel):
    """Image model that fingerprints images with ``PDQHasher``."""

    def compute_pdq(self, iobytes: io.BytesIO) -> str:
        """Compute the PDQ hash for an image buffer.

        :param iobytes: buffer handed to ``PDQHasher.fromBufferedImage``.
        :returns: the hash rendered by ``dumpBitsFlat()``.
        """
        hasher = PDQHasher()
        return hasher.fromBufferedImage(iobytes).getHash().dumpBitsFlat()

    def compute_imagehash(self, iobytes: io.BytesIO) -> str:
        """Return the PDQ hash of ``iobytes`` (delegates to ``compute_pdq``)."""
        return self.compute_pdq(iobytes)
52 changes: 52 additions & 0 deletions lib/model/image_sscd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from typing import Dict
import io

from lib.model.generic_image import GenericImageModel
from lib import schemas
from torchvision import transforms
import torch
from lib.logger import logger
import numpy as np
from PIL import Image
import urllib.request

class Model(GenericImageModel):
    """Image model that computes SSCD embeddings with a TorchScript model."""

    def __init__(self):
        """Load the SSCD TorchScript model, downloading it if loading fails."""
        super().__init__()
        # FIXME: Load from a Meedan S3 bucket (flagged in review as still to do)
        try:
            self.model = torch.jit.load("sscd_disc_mixup.torchscript.pt")
        except Exception:
            # Any load failure falls back to fetching the published weights.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate instead of triggering a download.
            logger.info("Downloading SSCD model...")
            with urllib.request.urlopen("https://dl.fbaipublicfiles.com/sscd-copy-detection/sscd_disc_mixup.torchscript.pt") as response:
                m = response.read()
            with open("sscd_disc_mixup.torchscript.pt", "wb") as fh:
                fh.write(m)
            self.model = torch.jit.load("sscd_disc_mixup.torchscript.pt")
        logger.info("SSCD model loaded")

    def compute_sscd(self, iobytes: io.BytesIO) -> list:
        """Compute the SSCD embedding of an image.

        :param iobytes: buffer that ``PIL.Image.open`` can read the image from.
        :returns: the model output for the image, as a list of Python floats.
        """
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225],
        )
        # Resize the shorter side to 288, convert to a tensor, then normalize.
        small_288 = transforms.Compose([
            transforms.Resize(288),
            transforms.ToTensor(),
            normalize,
        ])

        # The normalization statistics are given for three channels, so force
        # RGB; otherwise grayscale, palette or RGBA images do not match them.
        image = Image.open(iobytes).convert("RGB")
        # Add a leading batch dimension of size one for the model call.
        batch = small_288(image).unsqueeze(0)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            embedding = self.model(batch)[0, :]
        return embedding.detach().numpy().tolist()

    def compute_imagehash(self, iobytes: io.BytesIO) -> list:
        """Return the SSCD embedding of ``iobytes`` (delegates to ``compute_sscd``)."""
        return self.compute_sscd(iobytes)
10 changes: 6 additions & 4 deletions lib/queue/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,12 @@ def safely_respond(self, model: Model) -> List[schemas.Message]:
responses = []
if messages_with_queues:
logger.debug(f"About to respond to: ({messages_with_queues})")
try:
responses = model.respond([schemas.Message(**{**json.loads(message.body), **{"model_name": model.model_name}}) for message, queue in messages_with_queues])
except Exception as e:
logger.error(e)
#try:
responses = model.respond([schemas.Message(**{**json.loads(message.body), **{"model_name": model.model_name}}) for message, queue in messages_with_queues])
logger.info("!!!!")
logger.info(responses)
#except Exception as e:
# logger.error(e)
self.delete_messages(messages_with_queues)
return responses

6 changes: 5 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,8 @@ fasttext==0.9.2
langcodes==3.3.0
requests==2.31.0
pytest==7.4.0
sentry-sdk==1.30.0
sentry-sdk==1.30.0
pytorch-lightning==1.5.10
lightning-bolts==0.4.0
torch==1.9.0
torchvision==0.10.0
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def test_compute_pdq(self, mock_pdq_hasher):
image_content = file.read()
mock_hasher_instance = mock_pdq_hasher.return_value
mock_hasher_instance.fromBufferedImage.return_value.getHash.return_value.dumpBitsFlat.return_value = '1001'
result = Model.compute_pdq(io.BytesIO(image_content))
result = Model().compute_pdq(io.BytesIO(image_content))
self.assertEqual(result, '0011100000111011010110100001001110001011110100100010101011010111010110101010000111001010111000001010111111110000000101110010000011111110111110100100011111010010110110101111101100111001000000010010100101010111110001001101101011000110001000001110010000111100')

@patch("urllib.request.urlopen")
Expand Down
Loading