-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: management command to index content libraries using Meilisearch
- Loading branch information
1 parent
7fe4c06
commit 463549f
Showing
8 changed files
with
271 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
132 changes: 132 additions & 0 deletions
132
openedx/core/djangoapps/content_libraries/management/commands/reindex_libraries.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
""" | ||
Command to build or re-build the search index for content libraries. | ||
""" | ||
import logging | ||
import time | ||
|
||
from django.conf import settings | ||
from django.core.management import BaseCommand, CommandError | ||
import meilisearch | ||
from meilisearch.errors import MeilisearchError | ||
from meilisearch.models.task import TaskInfo | ||
|
||
from openedx.core.djangoapps.content_libraries import api as lib_api | ||
from openedx.core.djangoapps.content_libraries.search import searchable_doc_for_library_block | ||
from openedx.core.djangoapps.content_libraries.models import ContentLibrary | ||
|
||
|
||
log = logging.getLogger(__name__) | ||
|
||
LIBRARIES_INDEX_NAME = "content_libraries" | ||
|
||
|
||
class Command(BaseCommand):
    """
    Build or re-build the search index for content libraries.

    Documents are written to a temporary "<index>_new" index, which is then
    swapped with the live index and deleted, so the live index is only replaced
    once all documents have been added.
    """

    # Upper bound (seconds) on how long to poll for the index swap to become
    # visible before giving up, so a failed swap cannot hang the command forever.
    SWAP_TIMEOUT_SECONDS = 600

    def handle(self, *args, **options):
        """
        Build a new search index

        Raises:
            CommandError: if Meilisearch is not configured, cannot be reached,
                or the index swap does not complete in time.
            MeilisearchError: if a Meilisearch task fails.
        """
        # Connect to Meilisearch
        if not settings.MEILISEARCH_URL:
            raise CommandError("MEILISEARCH_URL is not set - search functionality disabled.")
        # TODO: put this into a helper lib?
        client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY)
        try:
            client.health()
        except MeilisearchError as err:
            self.stderr.write(err.message)  # print this because 'raise...from...' doesn't print the details
            raise CommandError("Unable to connect to Meilisearch") from err

        # Get the list of libraries, and load the metadata of every block up front
        # so that the total is known before indexing starts.
        self.stdout.write("Counting libraries...")
        lib_keys = [lib.library_key for lib in ContentLibrary.objects.select_related('org').only('org', 'slug')]
        blocks_by_lib_key = {}
        num_blocks = 0
        for lib_key in lib_keys:
            blocks_by_lib_key[lib_key] = [
                lib_api.LibraryXBlockMetadata.from_component(lib_key, component)
                for component in lib_api.get_library_components(lib_key)
            ]
            num_blocks += len(blocks_by_lib_key[lib_key])

        self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.")

        # Check if the (temporary) index exists already, e.g. left over from an aborted run:
        self.stdout.write("Checking index...")
        index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME
        temp_index_name = index_name + "_new"
        if self._index_exists(temp_index_name, client):
            self.stdout.write("Index already exists. Deleting it...")
            self._wait_for_meili_task(client, client.delete_index(temp_index_name))

        self.stdout.write("Creating new index...")
        self._wait_for_meili_task(
            client,
            client.create_index(temp_index_name, {'primaryKey': 'id'})
        )

        self.stdout.write("Indexing documents...")
        num_done = 0
        for lib_key in lib_keys:
            self.stdout.write(f"{num_done}/{num_blocks}. Now indexing {lib_key}")
            docs = [searchable_doc_for_library_block(metadata) for metadata in blocks_by_lib_key[lib_key]]
            if docs:  # Nothing to send for an empty library; skip the pointless task.
                # Add all the docs in this library at once (usually faster than adding one at a time):
                self._wait_for_meili_task(client, client.index(temp_index_name).add_documents(docs))
            num_done += len(docs)

        # Remember when the new index was created; this is how the swap is detected below.
        new_index_created = client.get_index(temp_index_name).created_at
        if not self._index_exists(index_name, client):
            # We have to create the "target" index before we can successfully swap the new one into it:
            self.stdout.write("Preparing to swap into index (first time)...")
            self._wait_for_meili_task(client, client.create_index(index_name))
        self.stdout.write("Swapping index...")
        client.swap_indexes([{'indexes': [temp_index_name, index_name]}])
        # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the status
        # of this request unfortunately. https://github.com/meilisearch/meilisearch/issues/4103
        self._wait_for_swap(client, index_name, new_index_created)
        # After the swap, the temporary name refers to the previous live index.
        self.stdout.write("Deleting old index...")
        self._wait_for_meili_task(client, client.delete_index(temp_index_name))

        self.stdout.write(f"Done! {num_blocks} blocks indexed.")

    def _wait_for_swap(self, client: meilisearch.Client, index_name: str, expected_created_at) -> None:
        """
        Poll until `index_name` reports the creation time `expected_created_at`.

        The swap is treated as complete once the live index name reports the
        creation time that the temporary index had before the swap.

        Raises:
            CommandError: if that has not happened within SWAP_TIMEOUT_SECONDS.
        """
        deadline = time.monotonic() + self.SWAP_TIMEOUT_SECONDS
        while True:
            time.sleep(1)
            if client.get_index(index_name).created_at == expected_created_at:
                return
            if time.monotonic() >= deadline:
                raise CommandError(
                    f"Timed out after {self.SWAP_TIMEOUT_SECONDS}s waiting for the index swap to complete."
                )
            self.stdout.write("Waiting for swap completion...")

    def _wait_for_meili_task(self, client: meilisearch.Client, info: TaskInfo):
        """
        Simple helper method to wait for a Meilisearch task to complete

        Polls the task once per second while it is "enqueued" or "processing".

        Raises:
            MeilisearchError: if the task ends in any status other than
                "succeeded"; the task's error details are written to stderr first.
        """
        current_status = client.get_task(info.task_uid)
        while current_status.status in ("enqueued", "processing"):
            self.stdout.write("...")
            time.sleep(1)
            current_status = client.get_task(info.task_uid)
        if current_status.status != "succeeded":
            self.stderr.write(f"Task has status: {current_status.status}")
            self.stderr.write(str(current_status.error))
            # The error payload may be missing or lack a 'message' key.
            try:
                message = current_status.error['message']
            except (TypeError, KeyError):
                message = "Unknown error"
            raise MeilisearchError(message)

    def _index_exists(self, index_name: str, client: meilisearch.Client) -> bool:
        """
        Check if an index exists

        Returns False only when Meilisearch reports that the index was not
        found. Any other error (connection failure, bad API key, ...) is
        re-raised rather than being mistaken for "index does not exist".
        """
        try:
            client.get_index(index_name)
        except MeilisearchError as err:
            # Only API errors carry a `code`; other MeilisearchError subclasses may not.
            if getattr(err, "code", None) == "index_not_found":
                return False
            raise
        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import logging | ||
|
||
from django.utils.text import slugify | ||
|
||
from openedx.core.djangoapps.content_libraries import api as lib_api | ||
from openedx.core.djangoapps.xblock import api as xblock_api | ||
|
||
log = logging.getLogger(__name__) | ||
|
||
|
||
def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict:
    """
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, so that the given library block can be
    found using faceted search.

    The block's own ``index_dictionary()`` data is flattened into the document
    on a best-effort basis: if loading or indexing the block fails, the error is
    logged and the document still contains the core fields derived from
    ``metadata`` (id, usage_key, block_id, display_name, type, context_key, org).
    """
    # Imported locally so this function does not depend on the module's import block.
    import hashlib

    usage_key_str = str(metadata.usage_key)
    doc = {}
    try:
        block = xblock_api.load_block(metadata.usage_key, user=None)
        block_data = block.index_dictionary()
        # Will be something like:
        # {
        #    'content': {'display_name': '...', 'capa_content': '...'},
        #    'content_type': 'CAPA',
        #    'problem_types': ['multiplechoiceresponse']
        # }
        # Which we need to flatten:
        block_data.pop("content_type", None)  # Redundant with our "type" field
        content = block_data.get("content")
        if isinstance(content, dict):
            del block_data["content"]
            # Copy the nested fields up a level (without mutating the nested dict);
            # "display_name" is dropped because it is set from metadata below.
            block_data.update({key: value for key, value in content.items() if key != "display_name"})
        # Now we have
        # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] }
        doc.update(block_data)
    except Exception as err:  # pylint: disable=broad-except
        # Deliberately best-effort: one broken block must not abort indexing of the rest.
        log.exception("Failed to get index_dictionary for %s: %s", metadata.usage_key, err)
    # The slugified key alone may not be unique, so a short digest of the full key is
    # appended. It must be a *stable* digest: the built-in hash() of a str is salted
    # per process, which would give the same block a different id on every run.
    key_digest = hashlib.blake2b(usage_key_str.encode("utf-8"), digest_size=4).hexdigest()
    # The data below must always override any values from index_dictionary:
    doc.update({
        # A Meilisearch document identifier can be of type integer or string,
        # only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).
        # So our usage keys with ":" characters cannot be used as primary keys.
        "id": slugify(usage_key_str) + "-" + key_digest,
        "usage_key": usage_key_str,
        "block_id": str(metadata.usage_key.block_id),
        "display_name": metadata.display_name,
        "type": metadata.usage_key.block_type,
        # This is called contextKey not libKey so we can use the same keys with courses, and maybe search
        # both courses and libraries together in the future?
        "context_key": str(metadata.usage_key.context_key),  # same as lib_key
        "org": str(metadata.usage_key.context_key.org),
    })
    return doc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.