From 1609890446a7f9859dd1b4c70b262860b1676afe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com> Date: Tue, 19 Nov 2024 14:42:20 +0100 Subject: [PATCH] fix: meilisearch document deletion In production, we sometimes observe the following error on document deletion: lms-1 | File "/openedx/venv/lib/python3.11/site-packages/forum/handlers.py", line 115, in handle_deletion lms-1 | search_backend.delete_document(sender.index_name, document_id) lms-1 | File "/openedx/venv/lib/python3.11/site-packages/forum/search/meilisearch.py", line 93, in delete_document lms-1 | doc_pk = m.id2pk(doc_id) lms-1 | ^^^^^^^^^^^^^^^ lms-1 | File "/openedx/venv/lib/python3.11/site-packages/search/meilisearch.py", line 362, in id2pk lms-1 | return hashlib.sha1(value.encode()).hexdigest() lms-1 | ^^^^^^^^^^^^ lms-1 | AttributeError: 'int' object has no attribute 'encode' This is due to the fact that the doc_id passed to the delete_document method is sometimes an integer, sometimes a str. To reflect that, we updated the base function signature. We also improved the unit test coverage. Not all functions are covered yet, but it's getting better. Note that to reproduce this issue we should use MySQL as a storage backend. --- forum/search/base.py | 6 +-- forum/search/es.py | 6 +-- forum/search/meilisearch.py | 24 ++++------- test_utils/mock_es_backend.py | 6 +-- tests/test_meilisearch.py | 78 +++++++++++++++++++++++++++++++++++ 5 files changed, 96 insertions(+), 24 deletions(-) create mode 100644 tests/test_meilisearch.py diff --git a/forum/search/base.py b/forum/search/base.py index fcc6e70..e65f561 100644 --- a/forum/search/base.py +++ b/forum/search/base.py @@ -15,16 +15,16 @@ class BaseDocumentSearchBackend: """ def index_document( - self, index_name: str, doc_id: str, document: dict[str, t.Any] + self, index_name: str, doc_id: str | int, document: dict[str, t.Any] ) -> None: raise NotImplementedError def update_document( - self, index_name: str, doc_id: str, update_data: dict[str, t.Any] + self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any] ) -> None: raise NotImplementedError - def delete_document(self, index_name: str, doc_id: str) -> None: + def delete_document(self, index_name: str, doc_id: str | int) -> None: raise NotImplementedError diff --git a/forum/search/es.py b/forum/search/es.py index 24eb67c..890f535 100644 --- a/forum/search/es.py +++ b/forum/search/es.py @@ -80,7 +80,7 @@ class ElasticsearchDocumentBackend( """ def update_document( - self, index_name: str, doc_id: str, update_data: dict[str, Any] + self, index_name: str, doc_id: str | int, update_data: dict[str, Any] ) -> None: """ Update a single document in the specified index. @@ -98,7 +98,7 @@ def update_document( except exceptions.RequestError as e: log.error(f"Error updating document {doc_id} in index {index_name}: {e}") - def delete_document(self, index_name: str, doc_id: str) -> None: + def delete_document(self, index_name: str, doc_id: str | int) -> None: """ Delete a single document from the specified index. @@ -115,7 +115,7 @@ def delete_document(self, index_name: str, doc_id: str) -> None: log.error(f"Error deleting document {doc_id} from index {index_name}: {e}") def index_document( - self, index_name: str, doc_id: str, document: dict[str, Any] + self, index_name: str, doc_id: str | int, document: dict[str, Any] ) -> None: """ Index a single document in the specified Elasticsearch index. diff --git a/forum/search/meilisearch.py b/forum/search/meilisearch.py index 3338cf2..aea8fc7 100644 --- a/forum/search/meilisearch.py +++ b/forum/search/meilisearch.py @@ -48,15 +48,14 @@ def create_document(document: dict[str, t.Any], doc_id: str) -> dict[str, t.Any] """ We index small documents in Meilisearch, with just a handful of fields. """ - # THE CAKE IS A LIE. Sometimes the doc_id is an integer. - doc_id = str(doc_id) processed = {"id": doc_id, m.PRIMARY_KEY_FIELD_NAME: m.id2pk(doc_id)} for field in ALL_FIELDS: if field in document: processed[field] = document[field] # We remove html markup, which breaks search in some places. For instance # "<p>Word" will not match "Word", which is a shame. - processed["body"] = BeautifulSoup(processed["body"]).get_text() + if body := processed.get("body"): + processed["body"] = BeautifulSoup(body, features="html.parser").get_text() return processed @@ -68,29 +67,29 @@ class MeilisearchDocumentBackend( """ def index_document( - self, index_name: str, doc_id: str, document: dict[str, t.Any] + self, index_name: str, doc_id: str | int, document: dict[str, t.Any] ) -> None: """ Insert a single document in the Meilisearch index. """ meilisearch_index = self.get_index(index_name) - processed = create_document(document, doc_id) + processed = create_document(document, str(doc_id)) meilisearch_index.add_documents([processed]) def update_document( - self, index_name: str, doc_id: str, update_data: dict[str, t.Any] + self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any] ) -> None: """ Updating is the same as inserting in meilisearch """ return self.index_document(index_name, doc_id, update_data) - def delete_document(self, index_name: str, doc_id: str) -> None: + def delete_document(self, index_name: str, doc_id: str | int) -> None: """ Delete a single document, identified by its ID. """ meilisearch_index = self.get_index(index_name) - doc_pk = m.id2pk(doc_id) + doc_pk = m.id2pk(str(doc_id)) meilisearch_index.delete_document(doc_pk) @@ -100,12 +99,7 @@ class MeilisearchIndexBackend(base.BaseIndexSearchBackend, MeilisearchClientMixi """ def initialize_indices(self, force_new_index: bool = False) -> None: - filterable_fields = [ - m.PRIMARY_KEY_FIELD_NAME, - "context", - "course_id", - "commentable_id", - ] + filterable_fields = [m.PRIMARY_KEY_FIELD_NAME] + FILTERABLE_FIELDS index_filterables = { Model.index_name: filterable_fields for Model in MODEL_INDICES } @@ -132,7 +126,7 @@ def rebuild_indices( for page_number in paginator.page_range: page = paginator.get_page(page_number) documents = [ - create_document(obj.doc_to_hash(), obj.id) + create_document(obj.doc_to_hash(), str(obj.id)) for obj in page.object_list ] if documents: diff --git a/test_utils/mock_es_backend.py b/test_utils/mock_es_backend.py index ae01351..781d50f 100644 --- a/test_utils/mock_es_backend.py +++ b/test_utils/mock_es_backend.py @@ -51,14 +51,14 @@ class MockElasticsearchDocumentBackend(ElasticsearchDocumentBackend): """ def update_document( - self, index_name: str, doc_id: str, update_data: dict[str, Any] + self, index_name: str, doc_id: str | int, update_data: dict[str, Any] ) -> None: """Mock method for updating a document in Elasticsearch.""" - def delete_document(self, index_name: str, doc_id: str) -> None: + def delete_document(self, index_name: str, doc_id: str | int) -> None: """Mock method for deleting a document from Elasticsearch.""" def index_document( - self, index_name: str, doc_id: str, document: dict[str, Any] + self, index_name: str, doc_id: str | int, document: dict[str, Any] ) -> None: """Mock method for indexing a document in Elasticsearch.""" diff --git a/tests/test_meilisearch.py b/tests/test_meilisearch.py new file mode 100644 index 0000000..20672e9 --- /dev/null +++ b/tests/test_meilisearch.py @@ -0,0 +1,78 @@ +""" +Unit tests for the meilisearch search backend. +""" + +from unittest.mock import patch, Mock + +import search.meilisearch as m +from forum.search import meilisearch + +TEST_ID = "abcd" +TEST_PK = m.id2pk(TEST_ID) + + +def test_create_document() -> None: + assert { + "id": TEST_ID, + m.PRIMARY_KEY_FIELD_NAME: TEST_PK, + } == meilisearch.create_document({}, TEST_ID) + + assert { + "id": TEST_ID, + m.PRIMARY_KEY_FIELD_NAME: TEST_PK, + "body": "Somebody", + } == meilisearch.create_document({"body": "Somebody"}, TEST_ID) + + assert { + "id": TEST_ID, + m.PRIMARY_KEY_FIELD_NAME: TEST_PK, + "course_id": "some/course/id", + } == meilisearch.create_document({"course_id": "some/course/id"}, TEST_ID) + + assert { + "id": TEST_ID, + m.PRIMARY_KEY_FIELD_NAME: TEST_PK, + } == meilisearch.create_document( + {"field_should_not_be_here": "some_value"}, TEST_ID + ) + + assert { + "id": TEST_ID, + m.PRIMARY_KEY_FIELD_NAME: TEST_PK, + "body": "Somebody", + } == meilisearch.create_document({"body": "<p>Somebody</p>"}, TEST_ID) + + +def test_index_document() -> None: + backend = meilisearch.MeilisearchDocumentBackend() + with patch.object( + backend, "get_index", return_value=Mock(add_documents=Mock()) + ) as mock_get_index: + backend.index_document( + "my_index", + TEST_ID, + { + "body": "<p>Some body</p>", + "some other field": "some value", + }, + ) + mock_get_index.assert_called_once_with("my_index") + mock_get_index().add_documents.assert_called_once_with( + [ + { + "id": TEST_ID, + "_pk": TEST_PK, + "body": "Some body", + } + ] + ) + + +def test_delete_document() -> None: + backend = meilisearch.MeilisearchDocumentBackend() + with patch.object( + backend, "get_index", return_value=Mock(add_documents=Mock()) + ) as mock_get_index: + backend.delete_document("my_index", TEST_ID) + mock_get_index.assert_called_once_with("my_index") + mock_get_index().delete_document.assert_called_once_with(TEST_PK)