From b07464ba2dc4e397af799e40effd2e267516ea2a Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Fri, 6 Dec 2024 17:30:38 -0300 Subject: [PATCH] feat: incremental reindex_studio management command (#35864) This allows large instances to run an (interruptable, resumable) reindex task that can cover thousands of courses. --- openedx/core/djangoapps/content/search/api.py | 222 +++++++++++------- .../djangoapps/content/search/index_config.py | 70 ++++++ .../management/commands/reindex_studio.py | 16 +- .../0002_incrementalindexcompleted.py | 21 ++ .../core/djangoapps/content/search/models.py | 12 + .../content/search/tests/test_api.py | 85 ++++++- 6 files changed, 342 insertions(+), 84 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/index_config.py create mode 100644 openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index b1d224b411e4..a18c55bd3d22 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -5,7 +5,7 @@ import logging import time -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from datetime import datetime, timedelta, timezone from functools import wraps from typing import Callable, Generator @@ -24,7 +24,14 @@ from rest_framework.request import Request from common.djangoapps.student.role_helpers import get_course_roles from openedx.core.djangoapps.content.course_overviews.models import CourseOverview -from openedx.core.djangoapps.content.search.models import get_access_ids_for_request +from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted +from openedx.core.djangoapps.content.search.index_config import ( + INDEX_DISTINCT_ATTRIBUTE, + INDEX_FILTERABLE_ATTRIBUTES, + INDEX_SEARCHABLE_ATTRIBUTES, + INDEX_SORTABLE_ATTRIBUTES, + INDEX_RANKING_RULES, +) from openedx.core.djangoapps.content_libraries import api as lib_api from xmodule.modulestore.django import modulestore @@ -217,6 +224,42 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat _wait_for_meili_task(client.delete_index(temp_index_name)) +def _index_is_empty(index_name: str) -> bool: + """ + Check if an index is empty + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + return index.get_stats().number_of_documents == 0 + + +def _configure_index(index_name): + """ + Configure the index. The following index settings are best changed on an empty index. + Changing them on a populated index will "re-index all documents in the index", which can take some time. + + Args: + index_name (str): The name of the index to configure + """ + client = _get_meilisearch_client() + + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): + client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE) + # Mark which attributes can be used for filtering/faceted search: + client.index(index_name).update_filterable_attributes(INDEX_FILTERABLE_ATTRIBUTES) + # Mark which attributes are used for keyword search, in order of importance: + client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES) + # Mark which attributes can be used for sorting search results: + client.index(index_name).update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES) + + # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. + # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy + client.index(index_name).update_ranking_rules(INDEX_RANKING_RULES) + + def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: """ Recurse the children of an XBlock and call the given function for each @@ -279,8 +322,75 @@ def is_meilisearch_enabled() -> bool: return False -# pylint: disable=too-many-statements -def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: +def reset_index(status_cb: Callable[[str], None] | None = None) -> None: + """ + Reset the Meilisearch index, deleting all documents and reconfiguring it + """ + if status_cb is None: + status_cb = log.info + + status_cb("Creating new empty index...") + with _using_temp_index(status_cb) as temp_index_name: + _configure_index(temp_index_name) + status_cb("Index recreated!") + status_cb("Index reset complete.") + + +def _is_index_configured(index_name: str) -> bool: + """ + Check if an index is completely configured + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + index_settings = index.get_settings() + for k, v in ( + ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE), + ("filterableAttributes", INDEX_FILTERABLE_ATTRIBUTES), + ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES), + ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES), + ("rankingRules", INDEX_RANKING_RULES), + ): + setting = index_settings.get(k, []) + if isinstance(v, list): + v = set(v) + setting = set(setting) + if setting != v: + return False + return True + + +def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: + """ + Initialize the Meilisearch index, creating it and configuring it if it doesn't exist + """ + if status_cb is None: + status_cb = log.info + if warn_cb is None: + warn_cb = log.warning + + if _index_exists(STUDIO_INDEX_NAME): + if _index_is_empty(STUDIO_INDEX_NAME): + warn_cb( + "The studio search index is empty. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + if not _is_index_configured(STUDIO_INDEX_NAME): + warn_cb( + "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + status_cb("Index already exists and is configured.") + return + + reset_index(status_cb) + + +def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: # lint-amnesty, pylint: disable=too-many-statements """ Rebuild the Meilisearch index from scratch """ @@ -292,7 +402,14 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: # Get the lists of libraries status_cb("Counting libraries...") - lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + keys_indexed = [] + if incremental: + keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True)) + lib_keys = [ + lib.library_key + for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id") + if lib.library_key not in keys_indexed + ] num_libraries = len(lib_keys) # Get the list of courses @@ -300,88 +417,25 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: num_courses = CourseOverview.objects.count() # Some counters so we can track our progress as indexing progresses: - num_contexts = num_courses + num_libraries - num_contexts_done = 0 # How many courses/libraries we've indexed + num_libs_skipped = len(keys_indexed) + num_contexts = num_courses + num_libraries + num_libs_skipped + num_contexts_done = 0 + num_libs_skipped # How many courses/libraries we've indexed num_blocks_done = 0 # How many individual components/XBlocks we've indexed status_cb(f"Found {num_courses} courses, {num_libraries} libraries.") - with _using_temp_index(status_cb) as temp_index_name: + with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name: ############## Configure the index ############## - # The following index settings are best changed on an empty index. - # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # The index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index", which can take some time # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. - - # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) - # Mark which attributes can be used for filtering/faceted search: - client.index(temp_index_name).update_filterable_attributes([ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ]) - # Mark which attributes are used for keyword search, in order of importance: - client.index(temp_index_name).update_searchable_attributes([ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ]) - # Mark which attributes can be used for sorting search results: - client.index(temp_index_name).update_sortable_attributes([ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ]) - - # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. - # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(temp_index_name).update_ranking_rules([ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ]) + if not incremental: + _configure_index(index_name) ############## Libraries ############## status_cb("Indexing libraries...") - def index_library(lib_key: str) -> list: + def index_library(lib_key: LibraryLocatorV2) -> list: docs = [] for component in lib_api.get_library_components(lib_key): try: @@ -396,7 +450,7 @@ def index_library(lib_key: str) -> list: if docs: try: # Add all the docs in this library at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing library {lib_key}: {err}") return docs @@ -416,7 +470,7 @@ def index_collection_batch(batch, num_done, library_key) -> int: if docs: try: # Add docs in batch of 100 at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing collection batch {p}: {err}") return num_done @@ -439,6 +493,8 @@ def index_collection_batch(batch, num_done, library_key) -> int: num_collections_done, lib_key, ) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key) status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}") num_contexts_done += 1 @@ -464,7 +520,7 @@ def add_with_children(block): if docs: # Add all the docs in this course at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) return docs paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000) @@ -473,10 +529,16 @@ def add_with_children(block): status_cb( f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" ) + if course.id in keys_indexed: + num_contexts_done += 1 + continue course_docs = index_course(course) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=course.id) num_contexts_done += 1 num_blocks_done += len(course_docs) + IncrementalIndexCompleted.objects.all().delete() status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.") diff --git a/openedx/core/djangoapps/content/search/index_config.py b/openedx/core/djangoapps/content/search/index_config.py new file mode 100644 index 000000000000..9570956e425e --- /dev/null +++ b/openedx/core/djangoapps/content/search/index_config.py @@ -0,0 +1,70 @@ +"""Configuration for the search index.""" +from .documents import Fields + + +INDEX_DISTINCT_ATTRIBUTE = "usage_key" + +# Mark which attributes can be used for filtering/faceted search: +INDEX_FILTERABLE_ATTRIBUTES = [ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, +] + +# Mark which attributes are used for keyword search, in order of importance: +INDEX_SEARCHABLE_ATTRIBUTES = [ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, +] + +# Mark which attributes can be used for sorting search results: +INDEX_SORTABLE_ATTRIBUTES = [ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, +] + +# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. +INDEX_RANKING_RULES = [ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", +] diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 3767ebcba6c9..2d8bb29f7a1f 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -18,8 +18,11 @@ class Command(BaseCommand): """ def add_arguments(self, parser): - parser.add_argument('--experimental', action='store_true') - parser.set_defaults(experimental=False) + parser.add_argument("--experimental", action="store_true") + parser.add_argument("--reset", action="store_true") + parser.add_argument("--init", action="store_true") + parser.add_argument("--incremental", action="store_true") + parser.set_defaults(experimental=False, reset=False, init=False, incremental=False) def handle(self, *args, **options): """ @@ -34,4 +37,11 @@ def handle(self, *args, **options): "Use the --experimental argument to acknowledge and run it." ) - api.rebuild_index(self.stdout.write) + if options["reset"]: + api.reset_index(self.stdout.write) + elif options["init"]: + api.init_index(self.stdout.write, self.stderr.write) + elif options["incremental"]: + api.rebuild_index(self.stdout.write, incremental=True) + else: + api.rebuild_index(self.stdout.write) diff --git a/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py new file mode 100644 index 000000000000..a316c35a7dfe --- /dev/null +++ b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.16 on 2024-11-15 12:40 + +from django.db import migrations, models +import opaque_keys.edx.django.models + + +class Migration(migrations.Migration): + + dependencies = [ + ('search', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='IncrementalIndexCompleted', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('context_key', opaque_keys.edx.django.models.LearningContextKeyField(max_length=255, unique=True)), + ], + ), + ] diff --git a/openedx/core/djangoapps/content/search/models.py b/openedx/core/djangoapps/content/search/models.py index 711c493ff895..6fa53ef17b34 100644 --- a/openedx/core/djangoapps/content/search/models.py +++ b/openedx/core/djangoapps/content/search/models.py @@ -65,3 +65,15 @@ def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) -> course_clause | library_clause ).order_by('-id').values_list("id", flat=True) ) + + +class IncrementalIndexCompleted(models.Model): + """ + Stores the contex keys of aleady indexed courses and libraries for incremental indexing. + """ + + context_key = LearningContextKeyField( + max_length=255, + unique=True, + null=False, + ) diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index 0aa762fd187f..c9c2b2589a31 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -10,8 +10,10 @@ from opaque_keys.edx.keys import UsageKey import ddt +import pytest from django.test import override_settings from freezegun import freeze_time +from meilisearch.errors import MeilisearchApiError from openedx_learning.api import authoring as authoring_api from organizations.tests.factories import OrganizationFactory @@ -26,7 +28,7 @@ try: # This import errors in the lms because content.search is not an installed app there. from .. import api - from ..models import SearchAccess + from ..models import SearchAccess, IncrementalIndexCompleted except RuntimeError: SearchAccess = {} @@ -239,6 +241,87 @@ def test_reindex_meilisearch(self, mock_meilisearch): any_order=True, ) + @override_settings(MEILISEARCH_ENABLED=True) + def test_reindex_meilisearch_incremental(self, mock_meilisearch): + + # Add tags field to doc, since reindex calls includes tags + doc_sequential = copy.deepcopy(self.doc_sequential) + doc_sequential["tags"] = {} + doc_vertical = copy.deepcopy(self.doc_vertical) + doc_vertical["tags"] = {} + doc_problem1 = copy.deepcopy(self.doc_problem1) + doc_problem1["tags"] = {} + doc_problem1["collections"] = {"display_name": [], "key": []} + doc_problem2 = copy.deepcopy(self.doc_problem2) + doc_problem2["tags"] = {} + doc_problem2["collections"] = {"display_name": [], "key": []} + doc_collection = copy.deepcopy(self.collection_dict) + doc_collection["tags"] = {} + + api.rebuild_index(incremental=True) + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 3 + mock_meilisearch.return_value.index.return_value.add_documents.assert_has_calls( + [ + call([doc_sequential, doc_vertical]), + call([doc_problem1, doc_problem2]), + call([doc_collection]), + ], + any_order=True, + ) + + # Now we simulate interruption by passing this function to the status_cb argument + def simulated_interruption(message): + # this exception prevents courses from being indexed + if "Indexing courses" in message: + raise Exception("Simulated interruption") + + with pytest.raises(Exception, match="Simulated interruption"): + api.rebuild_index(simulated_interruption, incremental=True) + + # two more calls due to collections + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 5 + assert IncrementalIndexCompleted.objects.all().count() == 1 + api.rebuild_index(incremental=True) + assert IncrementalIndexCompleted.objects.all().count() == 0 + # one missing course indexed + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 6 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_reset_meilisearch_index(self, mock_meilisearch): + api.reset_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + api.reset_index() + mock_meilisearch.return_value.delete_index.call_count = 4 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_init_meilisearch_index(self, mock_meilisearch): + # Test index already exists + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + # Test index already exists and has no documents + mock_meilisearch.return_value.get_stats.return_value = 0 + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + mock_meilisearch.return_value.get_index.side_effect = [ + MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')), + MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')), + Mock(created_at=1), + Mock(created_at=1), + Mock(created_at=1), + ] + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + @override_settings(MEILISEARCH_ENABLED=True) @patch( "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection",