feat: management command to index content libraries using Meilisearch
bradenmacdonald committed Feb 28, 2024
1 parent 7fe4c06 commit 463549f
Showing 8 changed files with 271 additions and 7 deletions.
11 changes: 11 additions & 0 deletions cms/envs/common.py
@@ -2869,3 +2869,14 @@ def _should_send_xblock_events(settings):
REST_FRAMEWORK['DEFAULT_SCHEMA_CLASS'] = 'drf_spectacular.openapi.AutoSchema'

BEAMER_PRODUCT_ID = ""

################### Search ###################

# To support multi-tenancy, you can prefix all indexes with a common key like "sandbox7-"
# and use a restricted tenant token in place of an API key, so that this Open edX instance
# can only use the index(es) that start with this prefix.
# See https://www.meilisearch.com/docs/learn/security/tenant_tokens
MEILISEARCH_INDEX_PREFIX = ""
# Set this to None to disable search functionality
MEILISEARCH_URL = "http://meilisearch"
MEILISEARCH_API_KEY = "devkey"
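
As a rough sketch of how these settings fit together: the client construction below mirrors the new management command, while the tenant-token call is an assumption about the meilisearch-python client API and is not part of this commit.

# Sketch only: connect to Meilisearch using the settings above (not part of this commit).
import meilisearch
from django.conf import settings

client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY)
index = client.index(settings.MEILISEARCH_INDEX_PREFIX + "content_libraries")

# For multi-tenancy, a tenant token restricted to the prefixed index can be handed to
# this instance in place of MEILISEARCH_API_KEY (the uid below is a placeholder):
token = client.generate_tenant_token(
    api_key_uid="<uid-of-a-search-api-key>",
    search_rules=[settings.MEILISEARCH_INDEX_PREFIX + "content_libraries"],
)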
132 changes: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
"""
Command to build or re-build the search index for content libraries.
"""
import logging
import time

from django.conf import settings
from django.core.management import BaseCommand, CommandError
import meilisearch
from meilisearch.errors import MeilisearchError
from meilisearch.models.task import TaskInfo

from openedx.core.djangoapps.content_libraries import api as lib_api
from openedx.core.djangoapps.content_libraries.search import searchable_doc_for_library_block
from openedx.core.djangoapps.content_libraries.models import ContentLibrary


log = logging.getLogger(__name__)

LIBRARIES_INDEX_NAME = "content_libraries"


class Command(BaseCommand):
    """
    Build or re-build the search index for content libraries.
    """

    def handle(self, *args, **options):
        """
        Build a new search index
        """

        # Connect to Meilisearch
        if not settings.MEILISEARCH_URL:
            raise CommandError("MEILISEARCH_URL is not set - search functionality disabled.")
        # TODO: put this into a helper lib?
        client = meilisearch.Client(settings.MEILISEARCH_URL, settings.MEILISEARCH_API_KEY)
        try:
            client.health()
        except MeilisearchError as err:
            self.stderr.write(err.message)  # print this because 'raise ... from ...' doesn't print the details
            raise CommandError("Unable to connect to Meilisearch") from err

        # Get the list of libraries
        self.stdout.write("Counting libraries...")
        lib_keys = [lib.library_key for lib in ContentLibrary.objects.select_related('org').only('org', 'slug')]
        blocks_by_lib_key = {}
        num_blocks = 0
        for lib_key in lib_keys:
            blocks_by_lib_key[lib_key] = []
            for component in lib_api.get_library_components(lib_key):
                blocks_by_lib_key[lib_key].append(lib_api.LibraryXBlockMetadata.from_component(lib_key, component))
                num_blocks += 1

        self.stdout.write(f"Found {num_blocks} XBlocks among {len(lib_keys)} libraries.")

        # Check if the index exists already:
        self.stdout.write("Checking index...")
        index_name = settings.MEILISEARCH_INDEX_PREFIX + LIBRARIES_INDEX_NAME
        temp_index_name = index_name + "_new"
        try:
            client.get_index(temp_index_name)
        except MeilisearchError:
            pass  # The temporary index doesn't exist yet, which is what we want.
        else:
            self.stdout.write("Temporary index already exists. Deleting it...")
            self._wait_for_meili_task(client, client.delete_index(temp_index_name))

        self.stdout.write("Creating new index...")
        self._wait_for_meili_task(
            client,
            client.create_index(temp_index_name, {'primaryKey': 'id'})
        )

        self.stdout.write("Indexing documents...")
        num_done = 0
        for lib_key in lib_keys:
            self.stdout.write(f"{num_done}/{num_blocks}. Now indexing {lib_key}")
            docs = []
            for metadata in blocks_by_lib_key[lib_key]:
                doc = searchable_doc_for_library_block(metadata)
                docs.append(doc)
            # Add all the docs in this library at once (usually faster than adding one at a time):
            self._wait_for_meili_task(client, client.index(temp_index_name).add_documents(docs))
            num_done += len(docs)

        new_index_created = client.get_index(temp_index_name).created_at
        if not self._index_exists(index_name, client):
            # We have to create the "target" index before we can successfully swap the new one into it:
            self.stdout.write("Preparing to swap into index (first time)...")
            self._wait_for_meili_task(client, client.create_index(index_name))
        self.stdout.write("Swapping index...")
        client.swap_indexes([{'indexes': [temp_index_name, index_name]}])
        # If we're using an API key that's restricted to certain index prefix(es), we won't be able to get the
        # status of this request, unfortunately. https://github.com/meilisearch/meilisearch/issues/4103
        while True:
            time.sleep(1)
            if client.get_index(index_name).created_at != new_index_created:
                self.stdout.write("Waiting for swap completion...")
            else:
                break
        self.stdout.write("Deleting old index...")
        self._wait_for_meili_task(client, client.delete_index(temp_index_name))

        self.stdout.write(f"Done! {num_blocks} blocks indexed.")

    def _wait_for_meili_task(self, client: meilisearch.Client, info: TaskInfo):
        """
        Simple helper method to wait for a Meilisearch task to complete
        """
        current_status = client.get_task(info.task_uid)
        while current_status.status in ("enqueued", "processing"):
            self.stdout.write("...")
            time.sleep(1)
            current_status = client.get_task(info.task_uid)
        if current_status.status != "succeeded":
            self.stderr.write(f"Task has status: {current_status.status}")
            self.stderr.write(str(current_status.error))
            try:
                raise MeilisearchError(current_status.error['message'])
            except (TypeError, KeyError):
                raise MeilisearchError("Unknown error")

    def _index_exists(self, index_name: str, client: meilisearch.Client) -> bool:
        """
        Check if an index exists
        """
        try:
            client.get_index(index_name)
        except MeilisearchError:
            return False
        return True
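
Once merged, the command would be run from the CMS. A minimal invocation sketch is shown below; the command's actual name comes from its file name under management/commands/, which is not visible on this page, so the name used here is hypothetical.

# Hypothetical usage sketch -- "reindex_content_libraries" is a placeholder name.
from django.core.management import call_command

call_command("reindex_content_libraries")
# or from a shell, e.g.:  python manage.py cms reindex_content_libraries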
56 changes: 56 additions & 0 deletions openedx/core/djangoapps/content_libraries/search.py
@@ -0,0 +1,56 @@
import logging

from django.utils.text import slugify

from openedx.core.djangoapps.content_libraries import api as lib_api
from openedx.core.djangoapps.xblock import api as xblock_api

log = logging.getLogger(__name__)


def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> dict:
    """
    Generate a dictionary document suitable for ingestion into a search engine
    like Meilisearch or Elasticsearch, so that the given library block can be
    found using faceted search.
    """
    doc = {}
    try:
        block = xblock_api.load_block(metadata.usage_key, user=None)
        block_data = block.index_dictionary()
        # Will be something like:
        # {
        #     'content': {'display_name': '...', 'capa_content': '...'},
        #     'content_type': 'CAPA',
        #     'problem_types': ['multiplechoiceresponse']
        # }
        # Which we need to flatten:
        if "content_type" in block_data:
            del block_data["content_type"]  # Redundant with our "type" field
        if "content" in block_data and isinstance(block_data["content"], dict):
            content = block_data["content"]
            if "display_name" in content:
                del content["display_name"]
            del block_data["content"]
            block_data.update(content)
        # Now we have
        # { 'capa_content': '...', 'problem_types': ['multiplechoiceresponse'] }
        doc.update(block_data)
    except Exception as err:
        log.exception(f"Failed to get index_dictionary for {metadata.usage_key}: {err}")
    # The data below must always override any values from index_dictionary:
    doc.update({
        # A Meilisearch document identifier can be of type integer or string,
        # only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).
        # So our usage keys with ":" characters cannot be used as primary keys.
        "id": slugify(str(metadata.usage_key)) + "-" + str(hash(str(metadata.usage_key)) % 1_000),
        "usage_key": str(metadata.usage_key),
        "block_id": str(metadata.usage_key.block_id),
        "display_name": metadata.display_name,
        "type": metadata.usage_key.block_type,
        # This is called "context_key" rather than "lib_key" so that the same field can be used for courses,
        # and maybe we can search both courses and libraries together in the future.
        "context_key": str(metadata.usage_key.context_key),  # same as lib_key
        "org": str(metadata.usage_key.context_key.org),
    })
    return doc
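
For illustration, the document this function produces for a multiple-choice problem in a library might look roughly like the following; the field names come from the code above, but every value here is made up.

# Illustrative only: all values below are hypothetical.
example_doc = {
    "id": "lbdemoxlib1problemprob1-234",   # slugified usage key + short numeric suffix
    "usage_key": "lb:DemoX:lib1:problem:prob1",
    "block_id": "prob1",
    "display_name": "A Multiple Choice Problem",
    "type": "problem",
    "context_key": "lib:DemoX:lib1",       # the library key
    "org": "DemoX",
    # Flattened from block.index_dictionary():
    "capa_content": "Which of the following is ...",
    "problem_types": ["multiplechoiceresponse"],
}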
14 changes: 14 additions & 0 deletions requirements/edx/base.txt
@@ -22,6 +22,8 @@ analytics-python==1.4.post1
# via -r requirements/edx/kernel.in
aniso8601==9.0.1
# via edx-tincan-py35
annotated-types==0.6.0
# via pydantic
appdirs==1.4.4
# via fs
asgiref==3.7.2
@@ -87,6 +89,8 @@ botocore==1.34.45
# s3transfer
bridgekeeper==0.9
# via -r requirements/edx/kernel.in
camel-converter[pydantic]==3.1.1
# via meilisearch
celery==5.3.6
# via
# -c requirements/edx/../constraints.txt
@@ -716,6 +720,8 @@ markupsafe==2.1.5
# xblock
maxminddb==2.5.2
# via geoip2
meilisearch==0.30.0
# via -r requirements/edx/kernel.in
mock==5.1.0
# via -r requirements/edx/paver.txt
mongoengine==0.27.0
@@ -860,6 +866,10 @@ pycryptodomex==3.20.0
# edx-proctoring
# lti-consumer-xblock
# pyjwkest
pydantic==2.6.3
# via camel-converter
pydantic-core==2.16.3
# via pydantic
pygments==2.17.2
# via
# -r requirements/edx/bundled.in
@@ -999,6 +1009,7 @@ requests==2.31.0
# edx-rest-api-client
# geoip2
# mailsnake
# meilisearch
# openai
# optimizely-sdk
# pyjwkest
@@ -1141,12 +1152,15 @@ tqdm==4.66.2
typing-extensions==4.9.0
# via
# -r requirements/edx/paver.txt
# annotated-types
# asgiref
# django-countries
# drf-spectacular
# edx-opaque-keys
# jwcrypto
# kombu
# pydantic
# pydantic-core
# pylti1p3
# snowflake-connector-python
tzdata==2024.1
18 changes: 16 additions & 2 deletions requirements/edx/development.txt
@@ -51,6 +51,7 @@ aniso8601==9.0.1
# edx-tincan-py35
annotated-types==0.6.0
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
# pydantic
anyio==4.3.0
@@ -168,6 +169,11 @@ cachetools==5.3.2
# via
# -r requirements/edx/testing.txt
# tox
camel-converter[pydantic]==3.1.1
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
# meilisearch
celery==5.3.6
# via
# -c requirements/edx/../constraints.txt
@@ -1187,6 +1193,10 @@ mccabe==0.7.0
# via
# -r requirements/edx/testing.txt
# pylint
meilisearch==0.30.0
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
mistune==2.0.5
# via
# -r requirements/edx/doc.txt
@@ -1459,12 +1469,15 @@ pycryptodomex==3.20.0
# edx-proctoring
# lti-consumer-xblock
# pyjwkest
pydantic==2.6.1
pydantic==2.6.3
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
# camel-converter
# fastapi
pydantic-core==2.16.2
pydantic-core==2.16.3
# via
# -r requirements/edx/doc.txt
# -r requirements/edx/testing.txt
# pydantic
pydata-sphinx-theme==0.14.4
@@ -1736,6 +1749,7 @@ requests==2.31.0
# edx-rest-api-client
# geoip2
# mailsnake
# meilisearch
# openai
# optimizely-sdk
# pact-python
22 changes: 22 additions & 0 deletions requirements/edx/doc.txt
@@ -33,6 +33,10 @@ aniso8601==9.0.1
# via
# -r requirements/edx/base.txt
# edx-tincan-py35
annotated-types==0.6.0
# via
# -r requirements/edx/base.txt
# pydantic
appdirs==1.4.4
# via
# -r requirements/edx/base.txt
@@ -114,6 +118,10 @@ botocore==1.34.45
# s3transfer
bridgekeeper==0.9
# via -r requirements/edx/base.txt
camel-converter[pydantic]==3.1.1
# via
# -r requirements/edx/base.txt
# meilisearch
celery==5.3.6
# via
# -c requirements/edx/../constraints.txt
@@ -840,6 +848,8 @@ maxminddb==2.5.2
# via
# -r requirements/edx/base.txt
# geoip2
meilisearch==0.30.0
# via -r requirements/edx/base.txt
mistune==2.0.5
# via sphinx-mdinclude
mock==5.1.0
@@ -1016,6 +1026,14 @@ pycryptodomex==3.20.0
# edx-proctoring
# lti-consumer-xblock
# pyjwkest
pydantic==2.6.3
# via
# -r requirements/edx/base.txt
# camel-converter
pydantic-core==2.16.3
# via
# -r requirements/edx/base.txt
# pydantic
pydata-sphinx-theme==0.14.4
# via sphinx-book-theme
pygments==2.17.2
@@ -1178,6 +1196,7 @@ requests==2.31.0
# edx-rest-api-client
# geoip2
# mailsnake
# meilisearch
# openai
# optimizely-sdk
# pyjwkest
@@ -1389,12 +1408,15 @@ tqdm==4.66.2
typing-extensions==4.9.0
# via
# -r requirements/edx/base.txt
# annotated-types
# asgiref
# django-countries
# drf-spectacular
# edx-opaque-keys
# jwcrypto
# kombu
# pydantic
# pydantic-core
# pydata-sphinx-theme
# pylti1p3
# snowflake-connector-python