From 1609890446a7f9859dd1b4c70b262860b1676afe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Tue, 19 Nov 2024 14:42:20 +0100
Subject: [PATCH] fix: meilisearch document deletion

In production, we sometimes observe the following error on document
deletion:

    	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/forum/handlers.py", line 115, in handle_deletion
	lms-1  |     search_backend.delete_document(sender.index_name, document_id)
	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/forum/search/meilisearch.py", line 93, in delete_document
	lms-1  |     doc_pk = m.id2pk(doc_id)
	lms-1  |              ^^^^^^^^^^^^^^^
	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/search/meilisearch.py", line 362, in id2pk
	lms-1  |     return hashlib.sha1(value.encode()).hexdigest()
	lms-1  |                         ^^^^^^^^^^^^
	lms-1  | AttributeError: 'int' object has no attribute 'encode'

This is due to the fact that the doc_id passed to the delete_document
method is sometimes an integer, sometimes a str. To reflect that, we
updated the base function signature. We also improved the unit test
coverage. Not all functions are covered yet, but it's getting better.

Note that to reproduce this issue we should use MySQL as a storage
backend.
---
 forum/search/base.py          |  6 +--
 forum/search/es.py            |  6 +--
 forum/search/meilisearch.py   | 24 ++++-------
 test_utils/mock_es_backend.py |  6 +--
 tests/test_meilisearch.py     | 78 +++++++++++++++++++++++++++++++++++
 5 files changed, 96 insertions(+), 24 deletions(-)
 create mode 100644 tests/test_meilisearch.py

diff --git a/forum/search/base.py b/forum/search/base.py
index fcc6e70..e65f561 100644
--- a/forum/search/base.py
+++ b/forum/search/base.py
@@ -15,16 +15,16 @@ class BaseDocumentSearchBackend:
     """
 
     def index_document(
-        self, index_name: str, doc_id: str, document: dict[str, t.Any]
+        self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
     ) -> None:
         raise NotImplementedError
 
     def update_document(
-        self, index_name: str, doc_id: str, update_data: dict[str, t.Any]
+        self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
     ) -> None:
         raise NotImplementedError
 
-    def delete_document(self, index_name: str, doc_id: str) -> None:
+    def delete_document(self, index_name: str, doc_id: str | int) -> None:
         raise NotImplementedError
 
 
diff --git a/forum/search/es.py b/forum/search/es.py
index 24eb67c..890f535 100644
--- a/forum/search/es.py
+++ b/forum/search/es.py
@@ -80,7 +80,7 @@ class ElasticsearchDocumentBackend(
     """
 
     def update_document(
-        self, index_name: str, doc_id: str, update_data: dict[str, Any]
+        self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
     ) -> None:
         """
         Update a single document in the specified index.
@@ -98,7 +98,7 @@ def update_document(
         except exceptions.RequestError as e:
             log.error(f"Error updating document {doc_id} in index {index_name}: {e}")
 
-    def delete_document(self, index_name: str, doc_id: str) -> None:
+    def delete_document(self, index_name: str, doc_id: str | int) -> None:
         """
         Delete a single document from the specified index.
 
@@ -115,7 +115,7 @@ def delete_document(self, index_name: str, doc_id: str) -> None:
             log.error(f"Error deleting document {doc_id} from index {index_name}: {e}")
 
     def index_document(
-        self, index_name: str, doc_id: str, document: dict[str, Any]
+        self, index_name: str, doc_id: str | int, document: dict[str, Any]
     ) -> None:
         """
         Index a single document in the specified Elasticsearch index.
diff --git a/forum/search/meilisearch.py b/forum/search/meilisearch.py
index 3338cf2..aea8fc7 100644
--- a/forum/search/meilisearch.py
+++ b/forum/search/meilisearch.py
@@ -48,15 +48,14 @@ def create_document(document: dict[str, t.Any], doc_id: str) -> dict[str, t.Any]
     """
     We index small documents in Meilisearch, with just a handful of fields.
     """
-    # THE CAKE IS A LIE. Sometimes the doc_id is an integer.
-    doc_id = str(doc_id)
     processed = {"id": doc_id, m.PRIMARY_KEY_FIELD_NAME: m.id2pk(doc_id)}
     for field in ALL_FIELDS:
         if field in document:
             processed[field] = document[field]
     # We remove html markup, which breaks search in some places. For instance
     # "<p>Word" will not match "Word", which is a shame.
-    processed["body"] = BeautifulSoup(processed["body"]).get_text()
+    if body := processed.get("body"):
+        processed["body"] = BeautifulSoup(body, features="html.parser").get_text()
     return processed
 
 
@@ -68,29 +67,29 @@ class MeilisearchDocumentBackend(
     """
 
     def index_document(
-        self, index_name: str, doc_id: str, document: dict[str, t.Any]
+        self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
     ) -> None:
         """
         Insert a single document in the Meilisearch index.
         """
         meilisearch_index = self.get_index(index_name)
-        processed = create_document(document, doc_id)
+        processed = create_document(document, str(doc_id))
         meilisearch_index.add_documents([processed])
 
     def update_document(
-        self, index_name: str, doc_id: str, update_data: dict[str, t.Any]
+        self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
     ) -> None:
         """
         Updating is the same as inserting in meilisearch
         """
         return self.index_document(index_name, doc_id, update_data)
 
-    def delete_document(self, index_name: str, doc_id: str) -> None:
+    def delete_document(self, index_name: str, doc_id: str | int) -> None:
         """
         Delete a single document, identified by its ID.
         """
         meilisearch_index = self.get_index(index_name)
-        doc_pk = m.id2pk(doc_id)
+        doc_pk = m.id2pk(str(doc_id))
         meilisearch_index.delete_document(doc_pk)
 
 
@@ -100,12 +99,7 @@ class MeilisearchIndexBackend(base.BaseIndexSearchBackend, MeilisearchClientMixi
     """
 
     def initialize_indices(self, force_new_index: bool = False) -> None:
-        filterable_fields = [
-            m.PRIMARY_KEY_FIELD_NAME,
-            "context",
-            "course_id",
-            "commentable_id",
-        ]
+        filterable_fields = [m.PRIMARY_KEY_FIELD_NAME] + FILTERABLE_FIELDS
         index_filterables = {
             Model.index_name: filterable_fields for Model in MODEL_INDICES
         }
@@ -132,7 +126,7 @@ def rebuild_indices(
             for page_number in paginator.page_range:
                 page = paginator.get_page(page_number)
                 documents = [
-                    create_document(obj.doc_to_hash(), obj.id)
+                    create_document(obj.doc_to_hash(), str(obj.id))
                     for obj in page.object_list
                 ]
                 if documents:
diff --git a/test_utils/mock_es_backend.py b/test_utils/mock_es_backend.py
index ae01351..781d50f 100644
--- a/test_utils/mock_es_backend.py
+++ b/test_utils/mock_es_backend.py
@@ -51,14 +51,14 @@ class MockElasticsearchDocumentBackend(ElasticsearchDocumentBackend):
     """
 
     def update_document(
-        self, index_name: str, doc_id: str, update_data: dict[str, Any]
+        self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
     ) -> None:
         """Mock method for updating a document in Elasticsearch."""
 
-    def delete_document(self, index_name: str, doc_id: str) -> None:
+    def delete_document(self, index_name: str, doc_id: str | int) -> None:
         """Mock method for deleting a document from Elasticsearch."""
 
     def index_document(
-        self, index_name: str, doc_id: str, document: dict[str, Any]
+        self, index_name: str, doc_id: str | int, document: dict[str, Any]
     ) -> None:
         """Mock method for indexing a document in Elasticsearch."""
diff --git a/tests/test_meilisearch.py b/tests/test_meilisearch.py
new file mode 100644
index 0000000..20672e9
--- /dev/null
+++ b/tests/test_meilisearch.py
@@ -0,0 +1,78 @@
+"""
+Unit tests for the meilisearch search backend.
+"""
+
+from unittest.mock import patch, Mock
+
+import search.meilisearch as m
+from forum.search import meilisearch
+
+TEST_ID = "abcd"
+TEST_PK = m.id2pk(TEST_ID)
+
+
+def test_create_document() -> None:
+    assert {
+        "id": TEST_ID,
+        m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
+    } == meilisearch.create_document({}, TEST_ID)
+
+    assert {
+        "id": TEST_ID,
+        m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
+        "body": "Somebody",
+    } == meilisearch.create_document({"body": "Somebody"}, TEST_ID)
+
+    assert {
+        "id": TEST_ID,
+        m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
+        "course_id": "some/course/id",
+    } == meilisearch.create_document({"course_id": "some/course/id"}, TEST_ID)
+
+    assert {
+        "id": TEST_ID,
+        m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
+    } == meilisearch.create_document(
+        {"field_should_not_be_here": "some_value"}, TEST_ID
+    )
+
+    assert {
+        "id": TEST_ID,
+        m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
+        "body": "Somebody",
+    } == meilisearch.create_document({"body": "<p>Somebody</p>"}, TEST_ID)
+
+
+def test_index_document() -> None:
+    backend = meilisearch.MeilisearchDocumentBackend()
+    with patch.object(
+        backend, "get_index", return_value=Mock(add_documents=Mock())
+    ) as mock_get_index:
+        backend.index_document(
+            "my_index",
+            TEST_ID,
+            {
+                "body": "<p>Some body</p>",
+                "some other field": "some value",
+            },
+        )
+        mock_get_index.assert_called_once_with("my_index")
+        mock_get_index().add_documents.assert_called_once_with(
+            [
+                {
+                    "id": TEST_ID,
+                    "_pk": TEST_PK,
+                    "body": "Some body",
+                }
+            ]
+        )
+
+
+def test_delete_document() -> None:
+    backend = meilisearch.MeilisearchDocumentBackend()
+    with patch.object(
+        backend, "get_index", return_value=Mock(add_documents=Mock())
+    ) as mock_get_index:
+        backend.delete_document("my_index", TEST_ID)
+        mock_get_index.assert_called_once_with("my_index")
+        mock_get_index().delete_document.assert_called_once_with(TEST_PK)