"""
Utility functions for catalog query filtering without elasticsearch
"""
import logging
import operator


logger = logging.getLogger(__name__)


SUPPORTED_FILTER_COMPARISONS = [
    'exact',
    'not',
    'exclude',
    'gt',
    'gte',
    'lt',
    'lte',
]

# Ordered (numeric) comparison kinds mapped to the operator that implements
# them as ``op(content_value, query_value)``.
_ORDERING_OPERATORS = {
    'gt': operator.gt,
    'gte': operator.ge,
    'lt': operator.lt,
    'lte': operator.le,
}

# Known misspelled query keys mapped to the key that was intended.
_QUERY_KEY_CORRECTIONS = {
    'aggregration__key': 'aggregation_key',
    'aggregation__key': 'aggregation_key',
    'org__exempt': 'org__exclude',
}


class QueryFilterException(Exception):
    """
    An exception for content catalog query filtering
    """


def fix_common_query_key_mistakes(raw_query_key):
    """
    Return the corrected form of a known-misspelled query key.

    In production many queries have odd typos which seem to have been
    copypasta-proliferated. Only whole-key matches are corrected; any key
    that is not a known typo is returned unchanged.
    """
    return _QUERY_KEY_CORRECTIONS.get(raw_query_key, raw_query_key)


def extract_field_and_comparison_kind(raw_query_key):
    """
    Split a query key into the content_metadata field name and the kind of
    comparison that should be used for matching.

    ``"status"`` yields ``("status", "exact")`` and
    ``"aggregation_key__exclude"`` yields ``("aggregation_key", "exclude")``.

    Raises:
        QueryFilterException: if the key contains more than one ``__``
            separator, or names a comparison kind that is not listed in
            SUPPORTED_FILTER_COMPARISONS.
    """
    key_parts = raw_query_key.split('__')
    if len(key_parts) > 2:
        raise QueryFilterException(f'invalid syntax "{raw_query_key}"')
    field = key_parts[0]
    # comparison_kind defaults to "exact match" when no suffix is given
    comparison_kind = key_parts[1] if len(key_parts) == 2 else 'exact'
    if comparison_kind not in SUPPORTED_FILTER_COMPARISONS:
        raise QueryFilterException(f'unsupported action "{comparison_kind}" from query key "{raw_query_key}"')
    logger.debug('extract_field_and_action "%s" -> %s, %s', raw_query_key, field, comparison_kind)
    return field, comparison_kind


def _as_number(value):
    """
    Return ``value`` as a float, or None when it cannot be read as a number.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def field_comparison(query_value, content_value, comparison_kind):
    """
    Compare one query value with one content value using ``comparison_kind``.

    ``exact`` is equality, ``not`` and ``exclude`` are both inequality, and
    ``gt``/``gte``/``lt``/``lte`` compare both values as floats.

    An ordered comparison against a content value that is missing (None) or
    not numeric cannot be satisfied, so it returns False rather than failing
    on the float conversion. A non-numeric ``query_value`` still raises the
    ValueError/TypeError from ``float()``.

    Raises:
        QueryFilterException: if ``comparison_kind`` is not supported.
    """
    if comparison_kind == 'exact':
        return content_value == query_value
    if comparison_kind in ('not', 'exclude'):
        return content_value != query_value

    compare = _ORDERING_OPERATORS.get(comparison_kind)
    if compare is None:
        raise QueryFilterException(f'invalid comparison kind "{comparison_kind}"')

    content_number = _as_number(content_value)
    if content_number is None:
        return False
    return compare(content_number, float(query_value))


def _evaluate_query_value(query_value, content_value, comparison_kind):
    """
    Evaluate a scalar or list-valued query value against one content value.
    """
    if not isinstance(query_value, list):
        return field_comparison(query_value, content_value, comparison_kind)

    # Every item is evaluated (no short-circuit) so a malformed item always
    # surfaces its error regardless of its position in the list.
    item_results = [
        field_comparison(query_value_item, content_value, comparison_kind)
        for query_value_item in query_value
    ]
    logger.debug('%s, %s, %s -> %s', query_value, content_value, comparison_kind, item_results)
    # "exact" here means "IN" as in "is edx+demo IN ['edx+demo', 'mit+demo']"
    if comparison_kind == 'exact':
        return any(item_results)
    # anything else must hold for every item, e.g. "NOT IN" for exclude
    return all(item_results)


def does_query_match_content(query_dict, content_metadata_dict):
    """
    Evaluate a query and a content_metadata object to determine
    if the given content_metadata and query match.

    This is meant to partially emulate Django FieldLookups
    for dictionaries rather than querysets.
    https://docs.djangoproject.com/en/4.2/ref/models/querysets/#field-lookups

    Every key of ``query_dict`` is a constraint and all of them must hold.
    Fields absent from ``content_metadata_dict`` are compared as None.

    Raises:
        QueryFilterException: if any query key is malformed or uses an
            unsupported comparison kind.
    """
    results = {}
    for raw_query_key, query_value in query_dict.items():
        query_key = fix_common_query_key_mistakes(raw_query_key)
        # extract_field_and_comparison_kind already rejects unsupported kinds
        field, comparison_kind = extract_field_and_comparison_kind(query_key)
        content_value = content_metadata_dict.get(field)

        field_result = _evaluate_query_value(query_value, content_value, comparison_kind)
        logger.debug(
            '%s, %s %s -> %s, %s, %s',
            query_key, field, comparison_kind, query_value, content_value, field_result,
        )
        # Key on the raw query key, not the field: several constraints may
        # target the same field (e.g. price__gte and price__lte) and each
        # one has to be kept for the final all().
        results[raw_query_key] = field_result
    logger.debug(results)
    return all(results.values())
import logging

from django.core.management.base import BaseCommand

from enterprise_catalog.apps.catalog import filters
from enterprise_catalog.apps.catalog.models import (
    ContentMetadata,
    EnterpriseCatalog,
)


logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """
    Log, for every (content metadata, enterprise catalog) pair, whether the
    content is in the catalog's ``content_metadata`` and whether the
    in-process catalog filter says the catalog's query matches it.
    """
    help = (
        'Compare the Enterprise Catalog Query results to our own Catalog Filter'
    )

    def handle(self, *args, **options):
        """
        Run the comparison and write one log line per pair.

        A catalog whose query the filter rejects with QueryFilterException
        is reported once and skipped for the remaining content, so a single
        malformed query does not abort the whole run.
        """
        logger.info('compare_catalog_queries_to_filters starting...')
        # Load the catalogs once instead of re-querying them per content row.
        enterprise_catalogs = list(EnterpriseCatalog.objects.all())
        rejected_catalog_uuids = set()
        for content_metadata in ContentMetadata.objects.all():
            for enterprise_catalog in enterprise_catalogs:
                if enterprise_catalog.uuid in rejected_catalog_uuids:
                    continue
                discovery_included = content_metadata in enterprise_catalog.content_metadata
                try:
                    match = filters.does_query_match_content(
                        enterprise_catalog.catalog_query.content_filter,
                        content_metadata.json_metadata
                    )
                except filters.QueryFilterException as exc:
                    rejected_catalog_uuids.add(enterprise_catalog.uuid)
                    logger.warning(
                        'compare_catalog_queries_to_filters '
                        f'enterprise_catalog={enterprise_catalog.uuid}, '
                        f'skipping catalog, filter rejected its query: {exc}'
                    )
                    continue
                logger.info(
                    'compare_catalog_queries_to_filters '
                    f'enterprise_catalog={enterprise_catalog.uuid}, '
                    f'content_metadata={content_metadata.content_key}, '
                    f'discovery_included={discovery_included}, '
                    f'filter_match={match}'
                )
        logger.info('compare_catalog_queries_to_filters complete.')
from unittest import mock

from django.core.management import call_command
from django.test import TestCase

from enterprise_catalog.apps.catalog.models import (
    CatalogQuery,
    ContentMetadata,
    EnterpriseCatalog,
)
from enterprise_catalog.apps.catalog.tests.factories import (
    CatalogQueryFactory,
    ContentMetadataFactory,
    EnterpriseCatalogFactory,
)


class CompareCatalogQueriesToFiltersCommandTests(TestCase):
    """
    Tests for the compare_catalog_queries_to_filters management command.
    """
    command_name = 'compare_catalog_queries_to_filters'

    def setUp(self):
        super().setUp()
        # One course-only query, a catalog using it, and a course tied to it.
        course_query = CatalogQueryFactory(content_filter={'content_type': 'course'})
        self.catalog_query_c = course_query
        self.enterprise_catalog_c = EnterpriseCatalogFactory(catalog_query=course_query)
        self.course_c = ContentMetadataFactory.create(content_type='course', catalog_queries=[course_query])

    def tearDown(self):
        super().tearDown()
        # clean up any stale test objects
        for model in (ContentMetadata, CatalogQuery, EnterpriseCatalog):
            model.objects.all().delete()

    def test_update_content_metadata_for_all_queries(self):
        """
        Verify that the job calls the comparison with the test data
        """
        patch_target = 'enterprise_catalog.apps.catalog.filters.does_query_match_content'
        with mock.patch(patch_target) as mock_does_query_match_content:
            mock_does_query_match_content.return_value = True
            call_command(self.command_name)
            mock_does_query_match_content.assert_called_with(
                self.catalog_query_c.content_filter,
                self.course_c.json_metadata,
            )
@factory.post_generation
def catalog_queries(self, create, extracted, **kwargs):
    """
    Attach the catalog queries passed to the factory to the new object.

    Does nothing for a simple build (``create`` is falsy) or when no
    catalog queries were supplied.
    """
    # http://web.archive.org/web/20230928174535/https://factoryboy.readthedocs.io/en/latest/recipes.html#simple-many-to-many-relationship
    if create and extracted:
        # Add the iterable of catalog_queries using bulk addition
        self.catalog_queries.add(*extracted)  # pylint: disable=no-member
""" Tests for catalog query filtering. """
import json
import logging

import ddt
import pytest
from django.test import TestCase

from enterprise_catalog.apps.catalog import filters


logger = logging.getLogger(__name__)


# Content metadata for the MITx course that most exclusion queries target.
_MITX_COURSE_CONTENT_JSON = """
{
    "aggregation_key": "course:MITx+6.002.3x",
    "content_type": "course"
}
"""

# Course query that excludes a list of aggregation keys.
_EXCLUDE_LIST_QUERY_JSON = """
{
    "content_type":"course",
    "aggregation_key__exclude":[
        "course:edX+DemoX.1",
        "course:MITx+6.002.1x",
        "course:MITx+6.002.2x",
        "course:MITx+6.002.3x"
    ]
}
"""

# Course query that excludes a single aggregation key.
_EXCLUDE_KEY_QUERY_JSON = """
{
    "content_type":"course",
    "aggregation_key__exclude":"course:MITx+6.002.3x"
}
"""

# Course query that uses "not" on a single aggregation key.
_NOT_KEY_QUERY_JSON = """
{
    "content_type":"course",
    "aggregation_key__not":"course:MITx+6.002.3x"
}
"""

# Query whose list values act as includes ("IN") for several fields.
_LEARNER_PATHWAY_INCLUDE_QUERY_JSON = """
{
    "partner":"edx",
    "status":[
        "published",
        "active"
    ],
    "content_type":[
        "learnerpathway",
        "course"
    ],
    "include_learner_pathways":"True",
    "aggregation_key":[
        "learnerpathway:786bbe57-e06c-4eee-92f4-49087fccc200",
        "learnerpathway:88a35038-d8d4-400e-a1dd-2d9e28d7740b",
        "learnerpathway:01deb04e-8965-4a77-b6b3-30c3b1a6e81d",
        "learnerpathway:bb233836-e6ae-4521-9727-91da454e0276",
        "learnerpathway:5202ccd8-ea91-4da2-8a23-bcf2c612074d",
        "learnerpathway:0610ee5a-2e78-4209-a47b-a2b6aae91a7f",
        "learnerpathway:339994f8-1b3e-480c-9a92-a43e8f4db82f",
        "learnerpathway:9e6607f2-d02e-4823-968d-02b0484a5a38",
        "learnerpathway:77bca285-7891-46e1-ab17-d4d4ecb34f92",
        "learnerpathway:46df6bad-6751-466a-9cbd-13cedab75403",
        "learnerpathway:a4176150-e3a5-4229-8be6-7198d7f16221",
        "learnerpathway:08d1928c-3a2a-4d0b-8cd6-566a722d3e4f",
        "learnerpathway:c7fafa58-7b67-4c69-82fe-e4098fbeb48c",
        "learnerpathway:5ad9fd53-93e1-46f9-8b17-ddc0102c1594"
    ]
}
"""


@ddt.ddt
class QueryFilterTests(TestCase):
    """
    Tests for filtering content metadata based on queries without elasticsearch
    """

    @staticmethod
    def _matches(query_json, content_metadata_json):
        """
        Parse both JSON documents and return the filter's verdict for them.
        """
        query_data = json.loads(query_json)
        content_metadata = json.loads(content_metadata_json)
        return filters.does_query_match_content(query_data, content_metadata)

    @ddt.data(
        {
            'raw_query_key': 'status',
            'expected_field': 'status',
            'expected_comparison_kind': 'exact'
        },
        {
            'raw_query_key': 'aggregation_key__exclude',
            'expected_field': 'aggregation_key',
            'expected_comparison_kind': 'exclude'
        },
    )
    @ddt.unpack
    def test_extract_field_and_comparison_kind(self, raw_query_key, expected_field, expected_comparison_kind):
        """
        A valid query key splits into the expected field and comparison kind.
        """
        extracted = filters.extract_field_and_comparison_kind(raw_query_key)
        self.assertEqual(tuple(extracted), (expected_field, expected_comparison_kind))

    @ddt.data(
        {'raw_query_key': 'status__deeper__field'},
        {'raw_query_key': 'aggregation_key__notreal'},
    )
    @ddt.unpack
    def test_invalid_extract_field_and_comparison_kind(self, raw_query_key):
        """
        Too many separators or an unknown comparison kind is rejected.
        """
        with pytest.raises(filters.QueryFilterException):
            filters.extract_field_and_comparison_kind(raw_query_key)

    def test_invalid_query_key(self):
        """
        A query containing an unsupported comparison kind is rejected.
        """
        query_json = """
        {
            "content_type":"course",
            "aggregation_key__notreal":"course:MITx+6.002.3x"
        }
        """
        with pytest.raises(filters.QueryFilterException):
            self._matches(query_json, _MITX_COURSE_CONTENT_JSON)

    def test_matching_exclude_list(self):
        """
        A matching query using an exclude list
        """
        content_metadata_json = """
        {
            "aggregation_key": "course:StellenboschX+AMDP.1",
            "content_type": "course",
            "key": "StellenboschX+AMDP.1",
            "title": "Freedom of expression in the African media and digital policy landscape",
            "card_image_url": null,
            "image_url": "https://prod-discovery.edx-cdn.org/media/course/image/3a836be6-9d49-4a2b-99f3-05a38def865b-f7efe0348a13.small.jpeg"
        }
        """
        assert self._matches(_EXCLUDE_LIST_QUERY_JSON, content_metadata_json)

    def test_non_matching_exclude_list(self):
        """
        A non-matching query using an exclude list.
        """
        assert not self._matches(_EXCLUDE_LIST_QUERY_JSON, _MITX_COURSE_CONTENT_JSON)

    def test_non_matching_exclude_key(self):
        """
        A non-matching query using an exclude key.
        """
        assert not self._matches(_EXCLUDE_KEY_QUERY_JSON, _MITX_COURSE_CONTENT_JSON)

    def test_matching_missing_exclude(self):
        """
        A matching query where an exclude references a missing key (valid).
        """
        content_metadata_json = """
        {
            "content_type": "course"
        }
        """
        assert self._matches(_EXCLUDE_KEY_QUERY_JSON, content_metadata_json)

    def test_matching_not_key(self):
        """
        A matching query using a not key.
        """
        content_metadata_json = """
        {
            "aggregation_key": "course:edX+DemoX.1",
            "content_type": "course"
        }
        """
        assert self._matches(_NOT_KEY_QUERY_JSON, content_metadata_json)

    def test_non_matching_not_key(self):
        """
        A non-matching query using a not key.
        """
        assert not self._matches(_NOT_KEY_QUERY_JSON, _MITX_COURSE_CONTENT_JSON)

    def test_matching_lte_key(self):
        """
        A matching query using an lte key (price equal to the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__lte":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 301, "content_type": "course"}'
        assert self._matches(query_json, content_metadata_json)

    def test_non_matching_lte_key(self):
        """
        A non-matching query using an lte key (price above the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__lte":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 302, "content_type": "course"}'
        assert not self._matches(query_json, content_metadata_json)

    def test_matching_lt_key(self):
        """
        A matching query using an lt key (price below the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__lt":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 300, "content_type": "course"}'
        assert self._matches(query_json, content_metadata_json)

    def test_non_matching_lt_key(self):
        """
        A non-matching query using an lt key (price equal to the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__lt":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 301, "content_type": "course"}'
        assert not self._matches(query_json, content_metadata_json)

    def test_matching_gte_key(self):
        """
        A matching query using a gte key (price equal to the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__gte":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 301, "content_type": "course"}'
        assert self._matches(query_json, content_metadata_json)

    def test_non_matching_gte_key(self):
        """
        A non-matching query using a gte key (price below the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__gte":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 300, "content_type": "course"}'
        assert not self._matches(query_json, content_metadata_json)

    def test_matching_gt_key(self):
        """
        A matching query using a gt key (price above the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__gt":"300"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 301, "content_type": "course"}'
        assert self._matches(query_json, content_metadata_json)

    def test_non_matching_gt_key(self):
        """
        A non-matching query using a gt key (price equal to the limit).
        """
        query_json = '{"content_type":"course", "first_enrollable_paid_seat_price__gt":"301"}'
        content_metadata_json = '{"first_enrollable_paid_seat_price": 301, "content_type": "course"}'
        assert not self._matches(query_json, content_metadata_json)

    def test_exact_list(self):
        """
        A matching query using a list exact key (aka include)
        """
        content_metadata_json = """
        {
            "status":"published",
            "partner":"edx",
            "content_type": "course",
            "include_learner_pathways":"True",
            "aggregation_key": "learnerpathway:786bbe57-e06c-4eee-92f4-49087fccc200"
        }
        """
        assert self._matches(_LEARNER_PATHWAY_INCLUDE_QUERY_JSON, content_metadata_json)

    def test_non_match_exact_list(self):
        """
        A non-matching query using a list exact key (aka include)
        """
        content_metadata_json = """
        {
            "status":"published",
            "partner":"edx",
            "content_type": "course",
            "include_learner_pathways":"True",
            "aggregation_key": "course:MITx+6.002.3x"
        }
        """
        assert not self._matches(_LEARNER_PATHWAY_INCLUDE_QUERY_JSON, content_metadata_json)