feat: catalog query filtering without elasticsearch

openedx · Sep 28, 2023 · 7290c19 · 7290c19
1 parent 44b4b17
commit 7290c19
Show file tree

Hide file tree

Showing 5 changed files with 711 additions and 0 deletions.
diff --git a/enterprise_catalog/apps/catalog/filters.py b/enterprise_catalog/apps/catalog/filters.py
@@ -0,0 +1,134 @@
+"""
+Utility functions for catalog query filtering without elasticsearch
+"""
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+# Comparison suffixes accepted after the "__" separator of a query key
+# (e.g. "org__exclude"); a key without a suffix is compared as 'exact'.
+SUPPORTED_FILTER_COMPARISONS = [
+    'exact',
+    'not',
+    'exclude',
+    'gt',
+    'gte',
+    'lt',
+    'lte',
+]
+
+
+class QueryFilterException(Exception):
+    """
+    Raised when a catalog query key or comparison kind cannot be
+    evaluated by the content catalog query filtering helpers.
+    """
+
+
+def fix_common_query_key_mistakes(raw_query_key):
+    """
+    Normalize a query key that carries one of the known typos.
+
+    Several misspelled keys have spread through production catalog
+    queries by copy and paste; each one is mapped back to the key that
+    was intended. Any other key is returned unchanged.
+    """
+    known_typos = {
+        'aggregation_key': ('aggregration__key', 'aggregation__key'),
+        'org__exclude': ('org__exempt',),
+    }
+    return next(
+        (
+            intended
+            for intended, misspellings in known_typos.items()
+            if raw_query_key in misspellings
+        ),
+        raw_query_key,
+    )
+
+
+def extract_field_and_comparison_kind(raw_query_key):
+    """
+    Split a query key into the content_metadata field name and the
+    kind of comparison that should be applied to that field.
+
+    A key has the form "<field>" or "<field>__<comparison>"; without a
+    suffix the comparison is "exact". Raises QueryFilterException when
+    the key holds more than one "__" separator or names a comparison
+    that is not in SUPPORTED_FILTER_COMPARISONS.
+    """
+    pieces = raw_query_key.split("__")
+    if len(pieces) > 2:
+        raise QueryFilterException(f'invalid syntax "{raw_query_key}"')
+    if len(pieces) == 2:
+        field, comparison_kind = pieces
+    else:
+        # no suffix: the whole key is the field, compared for equality
+        field, comparison_kind = raw_query_key, 'exact'
+    if comparison_kind not in SUPPORTED_FILTER_COMPARISONS:
+        raise QueryFilterException(f'unsupported action "{comparison_kind}" from query key "{raw_query_key}"')
+    logger.debug(f'extract_field_and_action "{raw_query_key}" -> {field}, {comparison_kind}')
+    return field, comparison_kind
+
+
+def field_comparison(query_value, content_value, comparison_kind):
+    """
+    Compare a content_metadata value against a query value.
+
+    Arguments:
+        query_value: the value taken from the catalog query.
+        content_value: the value taken from the content metadata; may be
+            None when the content has no such field.
+        comparison_kind: one of 'exact', 'not', 'exclude', 'gt', 'gte',
+            'lt' or 'lte'.
+
+    Returns:
+        bool: whether content_value satisfies the comparison. The
+        ordering comparisons convert both values with float() and
+        return False, instead of raising, when either value is not
+        numeric (for example a missing field).
+
+    Raises:
+        QueryFilterException: if comparison_kind is not recognised.
+    """
+    if comparison_kind == 'exact':
+        return content_value == query_value
+    if comparison_kind in ('not', 'exclude'):
+        return content_value != query_value
+    if comparison_kind not in ('gt', 'gte', 'lt', 'lte'):
+        raise QueryFilterException(f'invalid comparison kind "{comparison_kind}"')
+    try:
+        content_number = float(content_value)
+        query_number = float(query_value)
+    except (TypeError, ValueError):
+        # a missing or non-numeric value cannot satisfy an ordering test
+        return False
+    if comparison_kind == 'gt':
+        return content_number > query_number
+    if comparison_kind == 'gte':
+        return content_number >= query_number
+    if comparison_kind == 'lt':
+        return content_number < query_number
+    return content_number <= query_number
+
+
+def does_query_match_content(query_dict, content_metadata_dict):
+    """
+    Determine whether a content_metadata dictionary satisfies every
+    filter in a catalog query dictionary.
+
+    This is meant to partially emulate Django FieldLookups
+    for dictionaries rather than querysets.
+    https://docs.djangoproject.com/en/4.2/ref/models/querysets/#field-lookups
+
+    Arguments:
+        query_dict: maps query keys ("<field>" or "<field>__<comparison>")
+            to either a single value or a list of values.
+        content_metadata_dict: maps field names to content values; a
+            field that is absent is compared as None.
+
+    Returns:
+        bool: True when every query key matches (an empty query matches
+        everything). For a list value, 'exact' matches when any item
+        matches ("IN"); every other comparison must hold for all items
+        ("NOT IN" for 'not' / 'exclude').
+
+    Raises:
+        QueryFilterException: if a query key cannot be parsed or names
+        an unsupported comparison.
+    """
+    results = {}
+    for raw_query_key, query_value in query_dict.items():
+
+        query_key = fix_common_query_key_mistakes(raw_query_key)
+        # raises QueryFilterException for an unsupported comparison kind
+        field, comparison_kind = extract_field_and_comparison_kind(query_key)
+
+        content_value = content_metadata_dict.get(field)
+        logger.debug(f'{query_key}, {field} -> {query_value}, {content_value}')
+
+        if isinstance(query_value, list):
+            field_results = []
+            for query_value_item in query_value:
+                this_field_result = field_comparison(query_value_item, content_value, comparison_kind)
+                logger.debug(f'{query_value_item}, {content_value}, {comparison_kind} -> {this_field_result}')
+                field_results.append(this_field_result)
+            # "exact" here means "IN" as in "is edx+demo IN ['edx+demo', 'mit+demo']";
+            # anything else means "NOT IN", so every item has to pass
+            combine = any if comparison_kind == 'exact' else all
+            field_result = combine(field_results)
+        else:
+            field_result = field_comparison(query_value, content_value, comparison_kind)
+
+        logger.debug(f'{query_key}, {field} {comparison_kind} -> {query_value}, {content_value}, {field_result}')
+        # Keyed by the raw query key rather than the field, so that several
+        # filters on one field (e.g. "price__gte" and "price__lte") are all
+        # kept instead of the last one overwriting the others.
+        results[raw_query_key] = field_result
+    logger.debug(results)
+    return all(results.values())
diff --git a/enterprise_catalog/apps/catalog/management/commands/compare_catalog_queries_to_filters.py b/enterprise_catalog/apps/catalog/management/commands/compare_catalog_queries_to_filters.py
@@ -0,0 +1,39 @@
+import logging
+
+from django.core.management.base import BaseCommand
+
+from enterprise_catalog.apps.catalog import filters
+from enterprise_catalog.apps.catalog.models import (
+    ContentMetadata,
+    EnterpriseCatalog,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class Command(BaseCommand):
+    """
+    Management command that logs, for every pairing of content metadata
+    and enterprise catalog, whether the catalog currently includes the
+    content and whether the catalog query filter matches it.
+    """
+    help = (
+        'Compare the Enterprise Catalog Query results to our own Catalog Filter'
+    )
+
+    def handle(self, *args, **options):
+        """
+        Log one comparison line per (content metadata, enterprise catalog) pair.
+
+        A pair whose catalog query cannot be evaluated by the filter code is
+        logged as a warning and skipped rather than aborting the whole run.
+        """
+        logger.info('compare_catalog_queries_to_filters starting...')
+        # Build the catalog queryset once: iterating the same QuerySet object
+        # again reuses its result cache instead of re-querying per content item.
+        enterprise_catalogs = EnterpriseCatalog.objects.all()
+        for content_metadata in ContentMetadata.objects.all():
+            for enterprise_catalog in enterprise_catalogs:
+                discovery_included = content_metadata in enterprise_catalog.content_metadata
+                try:
+                    match = filters.does_query_match_content(
+                        enterprise_catalog.catalog_query.content_filter,
+                        content_metadata.json_metadata
+                    )
+                except filters.QueryFilterException as exc:
+                    # one malformed query must not stop the remaining comparisons
+                    logger.warning(
+                        'compare_catalog_queries_to_filters '
+                        f'enterprise_catalog={enterprise_catalog.uuid}, '
+                        f'content_metadata={content_metadata.content_key}, '
+                        f'filter_error={exc}'
+                    )
+                    continue
+                logger.info(
+                    'compare_catalog_queries_to_filters '
+                    f'enterprise_catalog={enterprise_catalog.uuid}, '
+                    f'content_metadata={content_metadata.content_key}, '
+                    f'discovery_included={discovery_included}, '
+                    f'filter_match={match}'
+                )
+        logger.info('compare_catalog_queries_to_filters complete.')
diff --git a/...catalog/apps/catalog/management/commands/tests/test_compare_catalog_queries_to_filters.py b/...catalog/apps/catalog/management/commands/tests/test_compare_catalog_queries_to_filters.py
@@ -0,0 +1,43 @@
+from unittest import mock
+
+from django.core.management import call_command
+from django.test import TestCase
+
+from enterprise_catalog.apps.catalog.models import (
+    CatalogQuery,
+    ContentMetadata,
+    EnterpriseCatalog,
+)
+from enterprise_catalog.apps.catalog.tests.factories import (
+    CatalogQueryFactory,
+    ContentMetadataFactory,
+    EnterpriseCatalogFactory,
+)
+
+
+class CompareCatalogQueriesToFiltersCommandTests(TestCase):
+    """
+    Tests for the compare_catalog_queries_to_filters management command.
+    """
+    command_name = 'compare_catalog_queries_to_filters'
+
+    def setUp(self):
+        super().setUp()
+        # a course-only query, a catalog that uses it, and a course tied to it
+        course_filter = {'content_type': 'course'}
+        self.catalog_query_c = CatalogQueryFactory(content_filter=course_filter)
+        self.enterprise_catalog_c = EnterpriseCatalogFactory(catalog_query=self.catalog_query_c)
+        self.course_c = ContentMetadataFactory.create(
+            content_type='course',
+            catalog_queries=[self.catalog_query_c],
+        )
+
+    def tearDown(self):
+        super().tearDown()
+        # clean up any stale test objects
+        for model in (ContentMetadata, CatalogQuery, EnterpriseCatalog):
+            model.objects.all().delete()
+
+    def test_update_content_metadata_for_all_queries(self):
+        """
+        Verify that the job calls the comparison with the test data
+        """
+        with mock.patch(
+            'enterprise_catalog.apps.catalog.filters.does_query_match_content',
+            return_value=True,
+        ) as mock_does_query_match_content:
+            call_command(self.command_name)
+        mock_does_query_match_content.assert_called_with(
+            self.catalog_query_c.content_filter,
+            self.course_c.json_metadata,
+        )
diff --git a/enterprise_catalog/apps/catalog/tests/factories.py b/enterprise_catalog/apps/catalog/tests/factories.py
@@ -79,6 +79,16 @@ class Meta:
     content_type = factory.Iterator([COURSE_RUN, COURSE, PROGRAM, LEARNER_PATHWAY])
     parent_content_key = None
 
+    @factory.post_generation
+    def catalog_queries(self, create, extracted, **kwargs):
+        """
+        Attach the catalog queries passed as ``catalog_queries=[...]`` to the created object.
+        """
+        # http://web.archive.org/web/20230928174535/https://factoryboy.readthedocs.io/en/latest/recipes.html#simple-many-to-many-relationship
+        if create and extracted:
+            # Add the iterable of catalog_queries using bulk addition
+            self.catalog_queries.add(*extracted)  # pylint: disable=no-member
+        # Otherwise: simple build, or nothing to add, so do nothing.
+
     @factory.lazy_attribute
     def json_metadata(self):
         json_metadata = {