Upgrade NLTK and Pillow packages and change base image to 36
wanliAlex authored Oct 10, 2024
1 parent 2d87f19 commit 0a83a6a
Showing 6 changed files with 223 additions and 43 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/cuda_docker_marqo.yml
@@ -66,11 +66,11 @@ jobs:
name: Run CUDA Docker Marqo API Tests
needs: Start-Runner # required to start the main job when the runner is ready
runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    environment: marqo-test-suite
-
+    environment: marqo-test-suite
+
steps:

- name: Checkout marqo repo
uses: actions/checkout@v3
with:
@@ -81,13 +81,13 @@ jobs:
with:
python-version: "3.8"
cache: "pip"

- name: Install Dependencies
run: |
#pip install -r requirements.txt
pip install tox==3.26
pip install flake8
- name: Set MQ_PY_MARQO_BRANCH variable
run: |
if [[ "${{ inputs.py_marqo_branch }}" == "marqo" ]]; then
@@ -97,21 +97,21 @@ jobs:
else
echo "MQ_PY_MARQO_BRANCH=git+https://github.com/marqo-ai/py-marqo.git@${{ inputs.py_marqo_branch }}" >> $GITHUB_ENV
fi
- name: Checkout marqo-api-tests repo
uses: actions/checkout@v3
with:
repository: marqo-ai/marqo-api-tests
ref: ${{ github.event.inputs.api_tests_branch }}

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Set up Environment
run: |
# Set up conf file
echo 'export MARQO_API_TESTS_ROOT="${{ github.workspace }}"' >> conf
- name: Run CUDA Integration Tests - CUDA Docker Marqo
run: |
export MQ_API_TEST_BRANCH="${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}"
2 changes: 1 addition & 1 deletion Dockerfile
@@ -6,7 +6,7 @@ COPY vespa .
RUN mvn clean package

# Stage 2: Base image for Python setup
-FROM marqoai/marqo-base:30 as base_image
+FROM marqoai/marqo-base:36 as base_image

# Allow mounting volume containing data and configs for vespa
VOLUME /opt/vespa/var
4 changes: 2 additions & 2 deletions requirements.dev.txt
@@ -23,10 +23,10 @@ huggingface-hub==0.25.0
more_itertools
boto3==1.25.4
botocore==1.28.4
-nltk==3.7
+nltk==3.9.1
torch==1.12.1
torchvision==0.13.1
-Pillow==9.3.0
+Pillow==10.4.0
numpy==1.23.4
validators==0.20.0
sentence-transformers==2.2.2
6 changes: 6 additions & 0 deletions src/marqo/s2_inference/processing/text.py
@@ -30,6 +30,12 @@ def _splitting_functions(split_by: str, language: str='english') -> FunctionType
    except LookupError:
        nltk.download("punkt")

+    # punkt_tab also needs to be downloaded for NLTK 3.9 and later
+    try:
+        nltk.data.find("tokenizers/punkt_tab")
+    except LookupError:
+        nltk.download("punkt_tab")
+
    MAPPING = {
        'character': list,
        'word': partial(word_tokenize, language=language),
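For context on the hunk above: NLTK 3.9 (the range the new nltk==3.9.1 pin falls in) stopped loading the pickled punkt models and instead resolves the Punkt tokenizer parameters from the separate punkt_tab resource, so word_tokenize raises LookupError when only the legacy punkt data is installed. A minimal standalone sketch of the guarded-download pattern the patch adds (assumes NLTK 3.9+; only the resource names come from the diff):

import nltk
from nltk.tokenize import word_tokenize

# Probe for both resources; nltk.data.find is a cheap lookup once the
# data is already on disk, so this is safe to run at import time.
for resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        nltk.download(resource)

print(word_tokenize('this is a test sentence. so is this.'))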
72 changes: 43 additions & 29 deletions tests/s2_inference/test_encoding.py
@@ -26,7 +26,8 @@ def tearDown(self) -> None:
        clear_loaded_models()

    def test_vectorize(self):
-        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
+        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32",
+                 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                  "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
                  "hf/bge-small-en-v1.5", "onnx/all-MiniLM-L6-v1", "onnx/all_datasets_v4_MiniLM-L6"]

@@ -38,7 +39,7 @@ def test_vectorize(self):
        names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]

        names = names + names_e5 + names_bge + names_snowflake

        sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
        device = 'cpu'
        eps = 1e-9
@@ -58,6 +59,40 @@ def test_vectorize(self):

        clear_loaded_models()

+    def test_vectorize_normalise(self):
+        open_clip_names = ["open_clip/ViT-B-32/laion2b_s34b_b79k"]
+
+        names_bge = ["hf/bge-small-en-v1.5", "hf/bge-base-en-v1.5"]
+
+        names_snowflake = ["hf/snowflake-arctic-embed-m", "hf/snowflake-arctic-embed-m-v1.5"]
+
+        names = open_clip_names + names_bge + names_snowflake
+
+        sentences = ['hello', 'this is a test sentence. so is this.', ['hello', 'this is a test sentence. so is this.']]
+        device = 'cpu'
+        eps = 1e-9
+
+        for name in names:
+            model_properties = get_model_properties_from_registry(name)
+            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
+
+            for sentence in sentences:
+                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                assert _check_output_type(output_v)
+                output_m = model.encode(sentence, normalize=True)
+                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                for vector in output_v:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) < 1e-5
+
+                output_v_unnormalised = vectorise(name, sentence, model_properties, device, normalize_embeddings=False)
+                assert _check_output_type(output_v_unnormalised)
+                output_m_unnormalised = model.encode(sentence, normalize=False)
+                assert abs(torch.FloatTensor(output_v_unnormalised) - torch.FloatTensor(output_m_unnormalised)).sum() < eps
+                for vector in output_v_unnormalised:
+                    assert abs(np.linalg.norm(np.array(vector)) - 1) > 1e-5
+
+        clear_loaded_models()

    def test_cpu_encode_type(self):
        names = ["fp16/ViT-B/32", "onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32',
                 "all-MiniLM-L6-v1", "all_datasets_v4_MiniLM-L6", "hf/all-MiniLM-L6-v1", "hf/all_datasets_v4_MiniLM-L6",
@@ -252,29 +287,6 @@ def test_model_un_normalization(self):

        clear_loaded_models()

-    def test_onnx_clip_vectorise(self):
-        names = ["onnx16/open_clip/ViT-B-32/laion400m_e32", 'onnx32/open_clip/ViT-B-32-quickgelu/laion400m_e32']
-
-        sentences = ['hello', 'this is a test sentence. so is this.',
-                     ['hello', 'this is a test sentence. so is this.']]
-        device = 'cpu'
-        eps = 1e-9
-
-        for name in names:
-            model_properties = get_model_properties_from_registry(name)
-            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)
-
-            for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
-
-                assert _check_output_type(output_v)
-
-                output_m = model.encode(sentence, normalize=True)
-
-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
-
-        clear_loaded_models()


class TestOpenClipModelEncoding(unittest.TestCase):
    '''
@@ -307,13 +319,15 @@ def test_open_clip_vectorize(self):
            model = _load_model(model_properties['name'], model_properties=model_properties, device=device)

            for sentence in sentences:
-                output_v = vectorise(name, sentence, model_properties, device, normalize_embeddings=True)
+                for normalize_embeddings in [True, False]:
+                    output_v = vectorise(name, sentence, model_properties, device,
+                                         normalize_embeddings=normalize_embeddings)

-                assert _check_output_type(output_v)
+                    assert _check_output_type(output_v)

-                output_m = model.encode(sentence, normalize=True)
+                    output_m = model.encode(sentence, normalize=normalize_embeddings)

-                assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps
+                    assert abs(torch.FloatTensor(output_m) - torch.FloatTensor(output_v)).sum() < eps

        clear_loaded_models()
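The normalisation assertions in these tests all reduce to checking each embedding's L2 norm against 1 within a small tolerance. A tiny self-contained illustration of that check (numpy only; is_unit_norm is a hypothetical helper, not part of the test suite):

import numpy as np

def is_unit_norm(vec, tol=1e-5):
    # A normalised embedding has L2 norm 1, up to floating-point error.
    return abs(np.linalg.norm(np.asarray(vec)) - 1.0) < tol

assert is_unit_norm([0.6, 0.8])      # 0.36 + 0.64 = 1, so the norm is exactly 1
assert not is_unit_norm([3.0, 4.0])  # norm is 5: unnormalised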

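To run just the encoding tests this commit touches, something like the following should work locally (a sketch; assumes pytest is installed alongside the pinned dev requirements and the working directory is the repo root):

import pytest

# Keyword-select the vectorise/normalise tests; adjust the path to your checkout.
pytest.main(["tests/s2_inference/test_encoding.py", "-k", "vectorize"])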
