Skip to content

Commit

Permalink
fix: meilisearch document deletion
Browse files Browse the repository at this point in the history
In production, we sometimes observe the following error on document
deletion:

    	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/forum/handlers.py", line 115, in handle_deletion
	lms-1  |     search_backend.delete_document(sender.index_name, document_id)
	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/forum/search/meilisearch.py", line 93, in delete_document
	lms-1  |     doc_pk = m.id2pk(doc_id)
	lms-1  |              ^^^^^^^^^^^^^^^
	lms-1  |   File "/openedx/venv/lib/python3.11/site-packages/search/meilisearch.py", line 362, in id2pk
	lms-1  |     return hashlib.sha1(value.encode()).hexdigest()
	lms-1  |                         ^^^^^^^^^^^^
	lms-1  | AttributeError: 'int' object has no attribute 'encode'

This is due to the fact that the doc_id passed to the delete_document
method is sometimes an integer, sometimes a str. To reflect that, we
updated the base function signature. We also improved the unit test
coverage. Not all functions are covered yet, but it's getting better.

Note that to reproduce this issue we should use MySQL as a storage
backend.
  • Loading branch information
regisb committed Nov 19, 2024
1 parent f924071 commit 1609890
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 24 deletions.
6 changes: 3 additions & 3 deletions forum/search/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@ class BaseDocumentSearchBackend:
"""

def index_document(
self, index_name: str, doc_id: str, document: dict[str, t.Any]
self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
) -> None:
raise NotImplementedError

def update_document(
self, index_name: str, doc_id: str, update_data: dict[str, t.Any]
self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
) -> None:
raise NotImplementedError

def delete_document(self, index_name: str, doc_id: str) -> None:
def delete_document(self, index_name: str, doc_id: str | int) -> None:
raise NotImplementedError


Expand Down
6 changes: 3 additions & 3 deletions forum/search/es.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class ElasticsearchDocumentBackend(
"""

def update_document(
self, index_name: str, doc_id: str, update_data: dict[str, Any]
self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
) -> None:
"""
Update a single document in the specified index.
Expand All @@ -98,7 +98,7 @@ def update_document(
except exceptions.RequestError as e:
log.error(f"Error updating document {doc_id} in index {index_name}: {e}")

def delete_document(self, index_name: str, doc_id: str) -> None:
def delete_document(self, index_name: str, doc_id: str | int) -> None:
"""
Delete a single document from the specified index.
Expand All @@ -115,7 +115,7 @@ def delete_document(self, index_name: str, doc_id: str) -> None:
log.error(f"Error deleting document {doc_id} from index {index_name}: {e}")

def index_document(
self, index_name: str, doc_id: str, document: dict[str, Any]
self, index_name: str, doc_id: str | int, document: dict[str, Any]
) -> None:
"""
Index a single document in the specified Elasticsearch index.
Expand Down
24 changes: 9 additions & 15 deletions forum/search/meilisearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,14 @@ def create_document(document: dict[str, t.Any], doc_id: str) -> dict[str, t.Any]
"""
We index small documents in Meilisearch, with just a handful of fields.
"""
# THE CAKE IS A LIE. Sometimes the doc_id is an integer.
doc_id = str(doc_id)
processed = {"id": doc_id, m.PRIMARY_KEY_FIELD_NAME: m.id2pk(doc_id)}
for field in ALL_FIELDS:
if field in document:
processed[field] = document[field]
# We remove html markup, which breaks search in some places. For instance
# "<p>Word" will not match "Word", which is a shame.
processed["body"] = BeautifulSoup(processed["body"]).get_text()
if body := processed.get("body"):
processed["body"] = BeautifulSoup(body, features="html.parser").get_text()
return processed


Expand All @@ -68,29 +67,29 @@ class MeilisearchDocumentBackend(
"""

def index_document(
self, index_name: str, doc_id: str, document: dict[str, t.Any]
self, index_name: str, doc_id: str | int, document: dict[str, t.Any]
) -> None:
"""
Insert a single document in the Meilisearch index.
"""
meilisearch_index = self.get_index(index_name)
processed = create_document(document, doc_id)
processed = create_document(document, str(doc_id))
meilisearch_index.add_documents([processed])

def update_document(
self, index_name: str, doc_id: str, update_data: dict[str, t.Any]
self, index_name: str, doc_id: str | int, update_data: dict[str, t.Any]
) -> None:
"""
Updating is the same as inserting in meilisearch
"""
return self.index_document(index_name, doc_id, update_data)

def delete_document(self, index_name: str, doc_id: str) -> None:
def delete_document(self, index_name: str, doc_id: str | int) -> None:
"""
Delete a single document, identified by its ID.
"""
meilisearch_index = self.get_index(index_name)
doc_pk = m.id2pk(doc_id)
doc_pk = m.id2pk(str(doc_id))
meilisearch_index.delete_document(doc_pk)


Expand All @@ -100,12 +99,7 @@ class MeilisearchIndexBackend(base.BaseIndexSearchBackend, MeilisearchClientMixi
"""

def initialize_indices(self, force_new_index: bool = False) -> None:
filterable_fields = [
m.PRIMARY_KEY_FIELD_NAME,
"context",
"course_id",
"commentable_id",
]
filterable_fields = [m.PRIMARY_KEY_FIELD_NAME] + FILTERABLE_FIELDS
index_filterables = {
Model.index_name: filterable_fields for Model in MODEL_INDICES
}
Expand All @@ -132,7 +126,7 @@ def rebuild_indices(
for page_number in paginator.page_range:
page = paginator.get_page(page_number)
documents = [
create_document(obj.doc_to_hash(), obj.id)
create_document(obj.doc_to_hash(), str(obj.id))
for obj in page.object_list
]
if documents:
Expand Down
6 changes: 3 additions & 3 deletions test_utils/mock_es_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@ class MockElasticsearchDocumentBackend(ElasticsearchDocumentBackend):
"""

def update_document(
self, index_name: str, doc_id: str, update_data: dict[str, Any]
self, index_name: str, doc_id: str | int, update_data: dict[str, Any]
) -> None:
"""Mock method for updating a document in Elasticsearch."""

def delete_document(self, index_name: str, doc_id: str) -> None:
def delete_document(self, index_name: str, doc_id: str | int) -> None:
"""Mock method for deleting a document from Elasticsearch."""

def index_document(
self, index_name: str, doc_id: str, document: dict[str, Any]
self, index_name: str, doc_id: str | int, document: dict[str, Any]
) -> None:
"""Mock method for indexing a document in Elasticsearch."""
78 changes: 78 additions & 0 deletions tests/test_meilisearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""
Unit tests for the meilisearch search backend.
"""

from unittest.mock import patch, Mock

import search.meilisearch as m
from forum.search import meilisearch

TEST_ID = "abcd"
TEST_PK = m.id2pk(TEST_ID)


def test_create_document() -> None:
assert {
"id": TEST_ID,
m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
} == meilisearch.create_document({}, TEST_ID)

assert {
"id": TEST_ID,
m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
"body": "Somebody",
} == meilisearch.create_document({"body": "Somebody"}, TEST_ID)

assert {
"id": TEST_ID,
m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
"course_id": "some/course/id",
} == meilisearch.create_document({"course_id": "some/course/id"}, TEST_ID)

assert {
"id": TEST_ID,
m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
} == meilisearch.create_document(
{"field_should_not_be_here": "some_value"}, TEST_ID
)

assert {
"id": TEST_ID,
m.PRIMARY_KEY_FIELD_NAME: TEST_PK,
"body": "Somebody",
} == meilisearch.create_document({"body": "<p>Somebody</p>"}, TEST_ID)


def test_index_document() -> None:
backend = meilisearch.MeilisearchDocumentBackend()
with patch.object(
backend, "get_index", return_value=Mock(add_documents=Mock())
) as mock_get_index:
backend.index_document(
"my_index",
TEST_ID,
{
"body": "<p>Some body</p>",
"some other field": "some value",
},
)
mock_get_index.assert_called_once_with("my_index")
mock_get_index().add_documents.assert_called_once_with(
[
{
"id": TEST_ID,
"_pk": TEST_PK,
"body": "Some body",
}
]
)


def test_delete_document() -> None:
backend = meilisearch.MeilisearchDocumentBackend()
with patch.object(
backend, "get_index", return_value=Mock(add_documents=Mock())
) as mock_get_index:
backend.delete_document("my_index", TEST_ID)
mock_get_index.assert_called_once_with("my_index")
mock_get_index().delete_document.assert_called_once_with(TEST_PK)

0 comments on commit 1609890

Please sign in to comment.