From f2e9eca93c3acc1f0f4ed3a11722e737577f31d7 Mon Sep 17 00:00:00 2001 From: Alexander J Sheehan Date: Wed, 8 May 2024 21:14:01 +0000 Subject: [PATCH] feat: adding guard rails to content query association setter to prevent large negative deltas --- ...query-content-associations-guard-rails.rst | 38 ++++++++++ enterprise_catalog/apps/catalog/models.py | 54 +++++++++++++- .../apps/catalog/tests/test_models.py | 74 +++++++++++++++++++ enterprise_catalog/settings/base.py | 9 +++ 4 files changed, 174 insertions(+), 1 deletion(-) create mode 100644 docs/decisions/0009-query-content-associations-guard-rails.rst diff --git a/docs/decisions/0009-query-content-associations-guard-rails.rst b/docs/decisions/0009-query-content-associations-guard-rails.rst new file mode 100644 index 000000000..3b9e4f298 --- /dev/null +++ b/docs/decisions/0009-query-content-associations-guard-rails.rst @@ -0,0 +1,38 @@ +Query Content Associations Guard Rails +====================================== + +Status +------ + +Accepted + +Context +------- +We have on occasion seen moments where Discovery has reported an extreme number of incorrect content records associated +with customer queries. Almost entirely in these situations we have seen the system right itself after some time and +return the expected number of records. While ensuring for normal variance and content modification rates, we want to +catch and prevent moments where Discovery or other sources of truth erroneously reports an extreme loss or gain of +content associated with a query. + +Decision +-------- +The ``associate_content_metadata_with_query`` method will now consider the size of the existing query/content +association list as compared to the list of content to update to. Should the content association update surpass a +configurable threshold, the update to the content associations will be blocked and the old state will be retained. + +To be considered for a threshold cutoff, the requester of ``associate_content_metadata_with_query`` must meet the +following criteria: +- The existing set of content associations must exceed a configurable cutoff +- The query must have not been modified today +- The change in number of content association records must exceed a configurable percentage, both in a positive and +negative direction (ie the query loses or gains more than x% of its prior number of records) + +Consequences +------------ +Content associations will not be updated in specific cases, causing stagnant data to be displayed to customers. The +guardrail is also not loud/destructive in that it will log a warning about its action but not stop the process, just +returning the existing content metadata association set. However, given the high levels of the thresholds set, it is +intended that this only happens when the alternative of letting the content association update going through would be +more disruptive to the customer's experience than to continue using old data. It is also intended that the check on the +``modified`` field value will ensure there is a direct and easy way to allow a large content associations update to go +through. diff --git a/enterprise_catalog/apps/catalog/models.py b/enterprise_catalog/apps/catalog/models.py index 0a0df0d23..a4a0c62bb 100644 --- a/enterprise_catalog/apps/catalog/models.py +++ b/enterprise_catalog/apps/catalog/models.py @@ -850,6 +850,54 @@ def create_content_metadata(metadata, catalog_query=None, dry_run=False): return metadata_list +def _check_content_association_threshold(catalog_query, metadata_list): + """ + Helper method to check a given catalog query's content metadata association set and compare it to a new set of + metadata records. Should the two sets of records differ in size beyond a configurable percentage value, and are + evaluated to be applicable, this method returns True, indicating the threshold of difference has been met. + + Applicability for the threshold is defined as such: + - The existing set of content associations must exceed a configurable cutoff value + - The query must have not been modified today, meaning it's stale + - The change in number of content association records must exceed a configurable percentage, both in a positive and + negative direction (ie the query loses or gains more than x% of its prior number of records) + """ + existing_relations_size = catalog_query.contentmetadata_set.count() + new_relations_size = len(metadata_list) + # To prevent false positives, this content association action stop gap will only apply to reasonably sized + # content sets + if existing_relations_size > settings.CATALOG_CONTENT_INCLUSION_GUARDRAIL_CONSIDERATION_FLOOR: + # If the catalog query hasn't been modified yet today, means there's no immediate reason for such a + # large change in content associations + if catalog_query.modified.date() < localized_utcnow().date(): + # Check if the association of content results in a percentage change of + # `CATALOG_CONTENT_INCLUSION_GUARDRAIL_ALLOWABLE_DELTA` of content items from the query's content set. + percent_change = abs((new_relations_size - existing_relations_size) / existing_relations_size) + if percent_change > settings.CATALOG_CONTENT_INCLUSION_GUARDRAIL_ALLOWABLE_DELTA: + LOGGER.warning( + "[CONTENT_DELTA_WARNING] associate_content_metadata_with_query is requested to set query: " + "%s to a content metadata set of length of %s when it previous had a content metadata set length " + "of:%s. The current threshold cutoff is a delta of: %s content remaining. The update has been " + "prevented.", + catalog_query, + new_relations_size, + existing_relations_size, + settings.CATALOG_CONTENT_INCLUSION_GUARDRAIL_ALLOWABLE_DELTA, + ) + return True + elif percent_change > settings.CATALOG_CONTENT_ASSOCIATIONS_CONTENT_DELTA_WARNING_THRESHOLD: + LOGGER.warning( + "[CONTENT_DELTA_WARNING] associate_content_metadata_with_query hit the warning threshold: %s " + "while setting query: %s to a content metadata set of length of %s when it previous had a content " + "metadata set length of:%s.", + settings.CATALOG_CONTENT_ASSOCIATIONS_CONTENT_DELTA_WARNING_THRESHOLD, + catalog_query, + new_relations_size, + existing_relations_size, + ) + return False + + def associate_content_metadata_with_query(metadata, catalog_query, dry_run=False): """ Creates or updates a ContentMetadata object for each entry in `metadata`, @@ -864,7 +912,11 @@ def associate_content_metadata_with_query(metadata, catalog_query, dry_run=False list: The list of content_keys for the metadata associated with the query. """ metadata_list = create_content_metadata(metadata, catalog_query, dry_run) - + # Stop gap if the new metadata list is extremely different from the current one + if _check_content_association_threshold(catalog_query, metadata_list): + return list(content_key for content_key in catalog_query.contentmetadata_set.values_list( + 'content_key', flat=True, + )) # Setting `clear=True` will remove all prior relationships between # the CatalogQuery's associated ContentMetadata objects # before setting all new relationships from `metadata_list`. diff --git a/enterprise_catalog/apps/catalog/tests/test_models.py b/enterprise_catalog/apps/catalog/tests/test_models.py index c8e04e895..d87401066 100644 --- a/enterprise_catalog/apps/catalog/tests/test_models.py +++ b/enterprise_catalog/apps/catalog/tests/test_models.py @@ -3,6 +3,7 @@ import json from collections import OrderedDict from contextlib import contextmanager +from datetime import timedelta from unittest import mock from uuid import uuid4 @@ -23,6 +24,7 @@ update_contentmetadata_from_discovery, ) from enterprise_catalog.apps.catalog.tests import factories +from enterprise_catalog.apps.catalog.utils import localized_utcnow @ddt.ddt @@ -499,3 +501,75 @@ def test_enrollment_url_exec_ed( else: assert 'happy-little-sku' in actual_enrollment_url assert 'proxy-login' in actual_enrollment_url + + @override_settings(DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT=0) + @mock.patch('enterprise_catalog.apps.api_client.discovery.DiscoveryApiClient') + def test_associate_content_metadata_with_query_guardrails(self, mock_client): + """ + Test the limitations of the `associate_content_metadata_with_query_guardrails` and situations where content + association sets are blocked from updates + """ + # Set up our catalog and query + catalog = factories.EnterpriseCatalogFactory() + + # Mock discovery returning a single metadata record for our query + course_metadata = OrderedDict([ + ('aggregation_key', 'course:edX+testX'), + ('key', 'edX+testX'), + ('title', 'test course'), + ]) + mock_client.return_value.get_metadata_by_query.return_value = [course_metadata] + + # The catalog has no values, and falls under the guard rails minimum threshold so the update should be + # uninhibited + update_contentmetadata_from_discovery(catalog.catalog_query) + assert len(catalog.catalog_query.contentmetadata_set.all()) == 1 + + # Build an existing content metadata set that will surpass the minimum + content_metadata = [] + for x in range(100): + content_metadata.append( + factories.ContentMetadataFactory( + content_key=f'the-content-key-{x}', + content_type=COURSE, + ) + ) + catalog.catalog_query.contentmetadata_set.set(content_metadata, clear=True) + + # Move the modified to before today + catalog.catalog_query.modified -= timedelta(days=2) + # Now if we run the update, with discovery mocked to return a single item, the update will be blocked and + # retain the 100 records since the modified at of the query isn't today + update_contentmetadata_from_discovery(catalog.catalog_query) + assert len(catalog.catalog_query.contentmetadata_set.all()) == 100 + + # updates that results in a net positive change in number of content record associations will be allowed + # so long as they fall under the threshold + course_metadata_list = [] + for x in range(120): + course_metadata_list.append(OrderedDict([ + ('aggregation_key', f'course:edX+testX-{x}'), + ('key', f'edX+testX-{x}'), + ('title', f'test course-{x}'), + ])) + # Mock discovery to return 101 returns + mock_client.return_value.get_metadata_by_query.return_value = course_metadata_list + update_contentmetadata_from_discovery(catalog.catalog_query) + assert len(catalog.catalog_query.contentmetadata_set.all()) == 120 + for x in range(120): + course_metadata_list.append(OrderedDict([ + ('aggregation_key', f'course:edX+testX-{x}'), + ('key', f'edX+testX-{x}'), + ('title', f'test course-{x}'), + ])) + mock_client.return_value.get_metadata_by_query.return_value = course_metadata_list + update_contentmetadata_from_discovery(catalog.catalog_query) + # with the 120 additional records returned, that exceeds the threshold + assert len(catalog.catalog_query.contentmetadata_set.all()) == 120 + + # Now that the current contentmetadata_set is of length 120 mock discovery to return just one record again + mock_client.return_value.get_metadata_by_query.return_value = [course_metadata] + # Move the modified at time to allow the update to go through + catalog.catalog_query.modified = localized_utcnow() + update_contentmetadata_from_discovery(catalog.catalog_query) + assert len(catalog.catalog_query.contentmetadata_set.all()) == 1 diff --git a/enterprise_catalog/settings/base.py b/enterprise_catalog/settings/base.py index f4428ce64..7d4118749 100644 --- a/enterprise_catalog/settings/base.py +++ b/enterprise_catalog/settings/base.py @@ -453,3 +453,12 @@ OPENAI_API_KEY = 'I am key' ############################################## AI CURATION ############################################## + +# Decimal percentage cutoff value for queries' content metadata association setting. If the absolute percent change +# from the old set length to the new set length is greater than the setting value, the update may be prevented +CATALOG_CONTENT_INCLUSION_GUARDRAIL_ALLOWABLE_DELTA = .8 +# Decimal percentage warning value for queries' content metadata association setting. If the absolute percent change +# is greater than the setting value, the operation may trigger a warning log +CATALOG_CONTENT_ASSOCIATIONS_CONTENT_DELTA_WARNING_THRESHOLD = .5 +# Flat minimum value of content associations for a catalog query to be considered for guardrail protections +CATALOG_CONTENT_INCLUSION_GUARDRAIL_CONSIDERATION_FLOOR = 50