Upgrade NLTK and Pillow packages and change base image to 36
wanliAlex authored Oct 10, 2024
1 parent 2d87f19 commit 0a83a6a
Showing 6 changed files with 223 additions and 43 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/cuda_docker_marqo.yml
@@ -66,11 +66,11 @@ jobs:
name: Run CUDA Docker Marqo API Tests
needs: Start-Runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    environment: marqo-test-suite
-
+    environment: marqo-test-suite
+
steps:

- name: Checkout marqo repo
uses: actions/checkout@v3
with:
@@ -81,13 +81,13 @@ jobs:
with:
python-version: "3.8"
cache: "pip"

- name: Install Dependencies
run: |
#pip install -r requirements.txt
pip install tox==3.26
pip install flake8
- name: Set MQ_PY_MARQO_BRANCH variable
run: |
if [[ "${{ inputs.py_marqo_branch }}" == "marqo" ]]; then
@@ -97,21 +97,21 @@ jobs:
else
echo "MQ_PY_MARQO_BRANCH=git+https://github.com/marqo-ai/py-marqo.git@${{ inputs.py_marqo_branch }}" >> $GITHUB_ENV
fi
- name: Checkout marqo-api-tests repo
uses: actions/checkout@v3
with:
repository: marqo-ai/marqo-api-tests
ref: ${{ github.event.inputs.api_tests_branch }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Set up Environment
run: |
# Set up conf file
echo 'export MARQO_API_TESTS_ROOT="${{ github.workspace }}"' >> conf
- name: Run CUDA Integration Tests - CUDA Docker Marqo
run: |
export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
2 changes: 1 addition & 1 deletion Dockerfile
@@ -6,7 +6,7 @@ COPY vespa .
RUN mvn clean package

# Stage 2: Base image for Python setup
-FROM marqoai/marqo-base:30 as base_image
+FROM marqoai/marqo-base:36 as base_image

# Allow mounting volume containing data and configs for vespa
VOLUME /opt/vespa/var
4 changes: 2 additions & 2 deletions requirements.dev.txt
@@ -23,10 +23,10 @@ huggingface-hub==0.25.0
more_itertools
boto3==1.25.4
botocore==1.28.4
-nltk==3.7
+nltk==3.9.1
torch==1.12.1
torchvision==0.13.1
-Pillow==9.3.0
+Pillow==10.4.0
numpy==1.23.4
validators==0.20.0
sentence-transformers==2.2.2
6 changes: 6 additions & 0 deletions src/marqo/s2_inference/processing/text.py
@@ -30,6 +30,12 @@ def _splitting_functions(split_by: str, language: str='english') -> FunctionType
    except LookupError:
        nltk.download("punkt")

+    # punkt_tab also needs to be downloaded for NLTK 3.9 and later
+    try:
+        nltk.data.find("tokenizers/punkt_tab")
+    except LookupError:
+        nltk.download("punkt_tab")
+
    MAPPING = {
        'character': list,
        'word': partial(word_tokenize, language=language),
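For context on the hunk above: NLTK 3.9 (the range the new nltk==3.9.1 pin falls in) stopped loading the pickled punkt models and instead resolves the Punkt tokenizer parameters from the separate punkt_tab resource, so word_tokenize raises LookupError when only the legacy punkt data is installed. A minimal standalone sketch of the guarded-download pattern the patch adds (assumes NLTK 3.9+; only the resource names come from the diff):

import nltk
from nltk.tokenize import word_tokenize

# Probe for both resources; nltk.data.find is a cheap lookup once the
# data is already on disk, so this is safe to run at import time.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

print(word_tokenize('this is a test sentence. so is this.'))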
72 changes: 43 additions & 29 deletions tests/s2_inference/test_encoding.py
@@ -26,7 +26,8 @@ def tearDown(self) -> None:
        clear_loaded_models()

    def test_vectorize(self):
-        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
+        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32",
+                 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                  "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
                  "hf/bge-small-en-v1.5", "onnx/all-MiniLM-L6-v1", "onnx/all_datasets_v4_MiniLM-L6"]

@@ -38,7 +39,7 @@ def test_vectorize(self):
        names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]

        names = names + names_e5 + names_bge + names_snowflake

        sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
        device = 'cpu'
        eps = 1e-9
@@ -58,6 +59,40 @@ def test_vectorize(self):

        clear_loaded_models()

+    def test_vectorize_normalise(self):
+        open_clip_names = ["open_clip/ViT-B-32/laion2b_s34b_b79k"]
+
+        names_bge = ["hf/bge-small-en-v1.5", "hf/bge-base-en-v1.5"]
+
+        names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]
+
+        names = open_clip_names + names_bge + names_snowflake
+
+        sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
+        device = 'cpu'
+        eps = 1e-9
+
+        for name in names:
+            model_properties = get_model_properties_from_registry(name)
+            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
+
+            for sentence in sentences:
+                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                assert _check_output_type(output_v)
+                output_m = model.encode(sentence, normalize=True)
+                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                for vector in output_v:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) < 1e-5
+
+                output_v_unnormalised = vectorise(name, sentence, model_properties, device, normalize_embeddings=False)
+                assert _check_output_type(output_v_unnormalised)
+                output_m_unnormalised = model.encode(sentence, normalize=False)
+                assert abs(torch.FloatTensor(output_v_unnormalised) - torch.FloatTensor(output_m_unnormalised)).sum() < eps
+                for vector in output_v_unnormalised:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) > 1e-5
+
+        clear_loaded_models()

    def test_cpu_encode_type(self):
        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                 "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
@@ -252,29 +287,6 @@ def test_model_un_normalization(self):

        clear_loaded_models()

-    def test_onnx_clip_vectorise(self):
-        names = ["onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32']
-
-        sentences = ['hello', 'this is a test sentence. so is this.',
-                     ['hello', 'this is a test sentence. so is this.']]
-        device = 'cpu'
-        eps = 1e-9
-
-        for name in names:
-            model_properties = get_model_properties_from_registry(name)
-            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
-
-            for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
-
-                assert _check_output_type(output_v)
-
-                output_m = model.encode(sentence, normalize=True)
-
-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
-
-        clear_loaded_models()


class TestOpenClipModelEncoding(unittest.TestCase):
    '''
@@ -307,13 +319,15 @@ def test_open_clip_vectorize(self):
            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)

            for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                for normalize_embeddings in [True, False]:
+                    output_v = vectorise(name, sentence, model_properties, device,
+                                         normalize_embeddings=normalize_embeddings)

-                assert _check_output_type(output_v)
+                    assert _check_output_type(output_v)

-                output_m = model.encode(sentence, normalize=True)
+                    output_m = model.encode(sentence, normalize=normalize_embeddings)

-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                    assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps

        clear_loaded_models()
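The normalisation assertions in these tests all reduce to checking each embedding's L2 norm against 1 within a small tolerance. A tiny self-contained illustration of that check (numpy only; is_unit_norm is a hypothetical helper, not part of the test suite):

import numpy as np

def is_unit_norm(vec, tol=1e-5):
    # A normalised embedding has L2 norm 1, up to floating-point error.
    return abs(np.linalg.norm(np.asarray(vec)) - 1.0) < tol

assert is_unit_norm([0.6, 0.8])      # 0.36 + 0.64 = 1, so the norm is exactly 1
assert not is_unit_norm([3.0, 4.0])  # norm is 5: unnormalised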

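To run just the encoding tests this commit touches, something like the following should work locally (a sketch; assumes pytest is installed alongside the pinned dev requirements and the working directory is the repo root):

import pytest

# Keyword-select the vectorise/normalise tests; adjust the path to your checkout.
pytest.main(["tests/s2_inference/test_encoding.py", "-k", "vectorize"])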
