From 30b511a45486c0cce3576a3924dbfd8e2c149920 Mon Sep 17 00:00:00 2001 From: Marlon Keating Date: Wed, 13 Mar 2024 23:05:26 +0000 Subject: [PATCH] feat: Add --dry-run and --no-async flags to update_content_metadata job test: Fix tests to accommodate update_content_metadata dry_run flag style: fix pycodestyle issues style: fix pylint errors fix: pr fixes + test coverage test: Add coverage for dry_run feat: add dry run logging chore: Add logging for changed catalog metadata counts feat: Add content metadata json logging for dry run --- enterprise_catalog/apps/api/tasks.py | 81 ++++++++----- .../apps/api/tests/test_tasks.py | 97 +++++++++++++++- .../tests/test_update_content_metadata.py | 46 ++++++-- .../commands/update_content_metadata.py | 86 +++++++++----- enterprise_catalog/apps/catalog/models.py | 78 ++++++++----- .../apps/catalog/tests/test_models.py | 107 ++++++++++++++++++ 6 files changed, 392 insertions(+), 103 deletions(-) diff --git a/enterprise_catalog/apps/api/tasks.py b/enterprise_catalog/apps/api/tasks.py index c24669f9b..85cdbadbe 100644 --- a/enterprise_catalog/apps/api/tasks.py +++ b/enterprise_catalog/apps/api/tasks.py @@ -226,7 +226,7 @@ class LoggedTaskWithRetry(LoggedTask): # pylint: disable=abstract-method @shared_task(base=LoggedTaskWithRetry, bind=True, default_retry_delay=UNREADY_TASK_RETRY_COUNTDOWN_SECONDS) @expiring_task_semaphore() -def update_full_content_metadata_task(self, force=False): # pylint: disable=unused-argument +def update_full_content_metadata_task(self, force=False, dry_run=False): # pylint: disable=unused-argument """ Looks up the full metadata from discovery's `/api/v1/courses` and `/api/v1/programs` endpoints to pad all ContentMetadata objects. The metadata is merged with the existing contents @@ -237,9 +237,9 @@ def update_full_content_metadata_task(self, force=False): # pylint: disable=unu """ content_keys = [metadata.content_key for metadata in ContentMetadata.objects.filter(content_type=COURSE)] - _update_full_content_metadata_course(content_keys) + _update_full_content_metadata_course(content_keys, dry_run) content_keys = [metadata.content_key for metadata in ContentMetadata.objects.filter(content_type=PROGRAM)] - _update_full_content_metadata_program(content_keys) + _update_full_content_metadata_program(content_keys, dry_run) def _find_best_mode_seat(seats): @@ -308,7 +308,7 @@ def _normalize_course_metadata(course_metadata_record): json_meta['normalized_metadata'] = normalized_metadata -def _update_full_content_metadata_course(content_keys): +def _update_full_content_metadata_course(content_keys, dry_run=False): """ Given content_keys, finds the associated ContentMetadata records with a type of course and looks up the full course metadata from discovery's /api/v1/courses endpoint to pad the ContentMetadata objects. 
The course @@ -385,13 +385,20 @@ def _update_full_content_metadata_course(content_keys): course_metadata_dict.get('programs', []), metadata_record, ) - _update_full_content_metadata_program(program_content_keys) - - ContentMetadata.objects.bulk_update( - modified_content_metadata_records, - ['json_metadata'], - batch_size=10, - ) + _update_full_content_metadata_program(program_content_keys, dry_run) + if dry_run: + logger.info('[Dry Run] Updated content metadata json for {}: {}'.format( + content_key, json.dumps(metadata_record.json_metadata) + )) + + if dry_run: + logger.info('dry_run=True, not updating course metadata') + else: + ContentMetadata.objects.bulk_update( + modified_content_metadata_records, + ['json_metadata'], + batch_size=10, + ) logger.info( 'Successfully updated %d of %d ContentMetadata records with full metadata from course-discovery.', @@ -408,7 +415,7 @@ def _update_full_content_metadata_course(content_keys): ) -def _update_full_content_metadata_program(content_keys): +def _update_full_content_metadata_program(content_keys, dry_run=False): """ Given content_keys, finds the associated ContentMetadata records with a type of program and looks up the full program metadata from discovery's /api/v1/programs endpoint to pad the ContentMetadata objects. The program @@ -454,11 +461,14 @@ def _update_full_content_metadata_program(content_keys): metadata_record.json_metadata.update(program_metadata_dict) modified_content_metadata_records.append(metadata_record) - ContentMetadata.objects.bulk_update( - modified_content_metadata_records, - ['json_metadata'], - batch_size=10, - ) + if dry_run: + logger.info('dry_run=true, not updating program metadata') + else: + ContentMetadata.objects.bulk_update( + modified_content_metadata_records, + ['json_metadata'], + batch_size=10, + ) logger.info( 'Successfully updated %d of %d ContentMetadata records with full metadata from course-discovery.', @@ -1084,7 +1094,7 @@ def _reindex_algolia(indexable_content_keys, nonindexable_content_keys, dry_run= @shared_task(base=LoggedTaskWithRetry, bind=True) @expiring_task_semaphore() -def update_catalog_metadata_task(self, catalog_query_id, force=False): # pylint: disable=unused-argument +def update_catalog_metadata_task(self, catalog_query_id, force=False, dry_run=False): # pylint: disable=unused-argument """ Associates ContentMetadata objects with the appropriate catalog query by pulling data from /search/all on discovery. @@ -1103,7 +1113,7 @@ def update_catalog_metadata_task(self, catalog_query_id, force=False): # pylint logger.error('Could not find a CatalogQuery with id %s', catalog_query_id) try: - associated_content_keys = update_contentmetadata_from_discovery(catalog_query) + associated_content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run) except Exception as e: logger.exception( f'Something went wrong while updating content metadata from discovery using catalog: {catalog_query_id} ' @@ -1120,7 +1130,7 @@ def update_catalog_metadata_task(self, catalog_query_id, force=False): # pylint @shared_task(base=LoggedTaskWithRetry, bind=True) @expiring_task_semaphore() -def fetch_missing_course_metadata_task(self, force=False): # pylint: disable=unused-argument +def fetch_missing_course_metadata_task(self, force=False, dry_run=False): # pylint: disable=unused-argument """ Creates a CatalogQuery for all the courses that do not have ContentMetadata instance. 
@@ -1155,7 +1165,7 @@ def fetch_missing_course_metadata_task(self, force=False): # pylint: disable=un defaults={'content_filter': content_filter, 'title': None}, ) - associated_content_keys = update_contentmetadata_from_discovery(catalog_query) + associated_content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run) logger.info('[FETCH_MISSING_METADATA] Finished fetch_missing_course_metadata_task with {} associated content ' 'keys for catalog {}'.format(len(associated_content_keys), catalog_query.id)) else: @@ -1164,7 +1174,7 @@ def fetch_missing_course_metadata_task(self, force=False): # pylint: disable=un @shared_task(base=LoggedTaskWithRetry, bind=True) @expiring_task_semaphore() -def fetch_missing_pathway_metadata_task(self, force=False): # pylint: disable=unused-argument +def fetch_missing_pathway_metadata_task(self, force=False, dry_run=False): # pylint: disable=unused-argument """ Creates ContentMetadata for Learner Pathways and all its associates. @@ -1186,7 +1196,7 @@ def fetch_missing_pathway_metadata_task(self, force=False): # pylint: disable=u content_filter_hash=get_content_filter_hash(content_filter), defaults={'content_filter': content_filter, 'title': None}, ) - associated_content_keys = update_contentmetadata_from_discovery(catalog_query) + associated_content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run) logger.info( '[FETCH_MISSING_METADATA] Finished Pathways fetch_missing_pathway_metadata_task with {} associated content ' 'keys for catalog {}'.format( @@ -1222,7 +1232,7 @@ def fetch_missing_pathway_metadata_task(self, force=False): # pylint: disable=u defaults={'content_filter': content_filter, 'title': None}, ) - associated_content_keys = update_contentmetadata_from_discovery(catalog_query) + associated_content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run) logger.info( '[FETCH_MISSING_METADATA] Finished programs fetch_missing_pathway_metadata_task with {} keys for ' 'catalog {}'.format( @@ -1250,7 +1260,7 @@ def fetch_missing_pathway_metadata_task(self, force=False): # pylint: disable=u defaults={'content_filter': content_filter, 'title': None}, ) - associated_content_keys = update_contentmetadata_from_discovery(catalog_query) + associated_content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run) logger.info( '[FETCH_MISSING_METADATA] Finished courses fetch_missing_pathway_metadata_task with {} keys for ' 'catalog {}'.format( @@ -1266,13 +1276,22 @@ def fetch_missing_pathway_metadata_task(self, force=False): # pylint: disable=u associated_content_metadata = ContentMetadata.objects.filter( content_key__in=program_uuids + course_keys ) - pathway.associated_content_metadata.set(associated_content_metadata) - logger.info( - '[FETCH_MISSING_METADATA] Learner Pathway {} associated created. No. of associations: {}'.format( - pathway.content_key, - pathway.associated_content_metadata.count(), + if dry_run: + logger.info( + ('[FETCH_MISSING_METADATA][Dry Run] Learner Pathway {} associations created.' + 'No. of associations: {}').format( + pathway.content_key, + pathway.associated_content_metadata.count(), + ) + ) + else: + pathway.associated_content_metadata.set(associated_content_metadata) + logger.info( + '[FETCH_MISSING_METADATA] Learner Pathway {} associated created. No. 
of associations: {}'.format( + pathway.content_key, + pathway.associated_content_metadata.count(), + ) ) - ) else: pathway.associated_content_metadata.clear() diff --git a/enterprise_catalog/apps/api/tests/test_tasks.py b/enterprise_catalog/apps/api/tests/test_tasks.py index 9b3e96c8d..ce7c3324b 100644 --- a/enterprise_catalog/apps/api/tests/test_tasks.py +++ b/enterprise_catalog/apps/api/tests/test_tasks.py @@ -193,8 +193,8 @@ def test_update_catalog_metadata(self, mock_update_data_from_discovery): """ Assert update_catalog_metadata_task is called with correct catalog_query_id """ - tasks.update_catalog_metadata_task.apply(args=(self.catalog_query.id,)) - mock_update_data_from_discovery.assert_called_with(self.catalog_query) + tasks.update_catalog_metadata_task.apply(args=(self.catalog_query.id, False, False)) + mock_update_data_from_discovery.assert_called_with(self.catalog_query, False) @mock.patch('enterprise_catalog.apps.api.tasks.update_contentmetadata_from_discovery') def test_update_catalog_metadata_no_catalog_query(self, mock_update_data_from_discovery): @@ -235,7 +235,7 @@ def test_fetch_missing_course_metadata_task(self, mock_update_data_from_discover assert catalog_query.content_filter['content_type'] == 'course' assert catalog_query.content_filter['key'] == [test_course] - mock_update_data_from_discovery.assert_called_with(catalog_query) + mock_update_data_from_discovery.assert_called_with(catalog_query, False) @ddt.ddt @@ -538,6 +538,48 @@ def test_update_full_metadata_program(self, mock_oauth_client, mock_partition_pr assert metadata_1.json_metadata == program_data_1 assert metadata_2.json_metadata == program_data_2 + @mock.patch('enterprise_catalog.apps.api.tasks.partition_program_keys_for_indexing') + @mock.patch('enterprise_catalog.apps.api_client.base_oauth.OAuthAPIClient') + def test_update_full_metadata_program_dry_run(self, mock_oauth_client, mock_partition_program_keys): + """ + Assert that during dry run full program metadata is not merged with original json_metadata + """ + program_key_1 = '02f5edeb-6604-4131-bf45-acd8df91e1f9' + program_data_1 = {'uuid': program_key_1, 'full_program_only_field': 'test_1'} + program_key_2 = 'be810df3-a059-42a7-b11f-d9bfb2877b15' + program_data_2 = {'uuid': program_key_2, 'full_program_only_field': 'test_2'} + + # Mock out the data that should be returned from discovery's /api/v1/programs endpoint + mock_oauth_client.return_value.get.return_value.json.return_value = { + 'results': [program_data_1, program_data_2], + } + mock_partition_program_keys.return_value = ([], [],) + + metadata_1 = ContentMetadataFactory(content_type=PROGRAM, content_key=program_key_1) + metadata_1.catalog_queries.set([self.catalog_query]) + metadata_2 = ContentMetadataFactory(content_type=PROGRAM, content_key=program_key_2) + metadata_2.catalog_queries.set([self.catalog_query]) + + assert metadata_1.json_metadata != program_data_1 + assert metadata_2.json_metadata != program_data_2 + + tasks.update_full_content_metadata_task.apply(kwargs={'dry_run': True}).get() + + actual_program_keys_args = mock_partition_program_keys.call_args_list[0][0][0] + self.assertEqual(set(actual_program_keys_args), {metadata_1, metadata_2}) + + metadata_1 = ContentMetadata.objects.get(content_key='02f5edeb-6604-4131-bf45-acd8df91e1f9') + metadata_2 = ContentMetadata.objects.get(content_key='be810df3-a059-42a7-b11f-d9bfb2877b15') + + # Validate original json_metadata still in place after dry run + program_data_1.update(metadata_1.json_metadata) + 
program_data_2.update(metadata_2.json_metadata) + program_data_1.update({'aggregation_key': 'program:02f5edeb-6604-4131-bf45-acd8df91e1f9'}) + program_data_2.update({'aggregation_key': 'program:be810df3-a059-42a7-b11f-d9bfb2877b15'}) + + assert metadata_1.json_metadata != program_data_1 + assert metadata_2.json_metadata != program_data_2 + # pylint: disable=unused-argument @mock.patch('enterprise_catalog.apps.api.tasks.task_recently_run', return_value=False) @mock.patch('enterprise_catalog.apps.api.tasks.partition_program_keys_for_indexing') @@ -2097,3 +2139,52 @@ def test_update_full_content_metadata_course( self.assertEqual(mock_update_content_metadata_program.call_count, 2) self.assertEqual(mock_create_course_associated_programs.call_count, 2) + + @mock.patch('enterprise_catalog.apps.api.tasks._fetch_courses_by_keys') + @mock.patch('enterprise_catalog.apps.api.tasks.DiscoveryApiClient.get_course_reviews') + @mock.patch('enterprise_catalog.apps.api.tasks.ContentMetadata.objects.filter') + @mock.patch('enterprise_catalog.apps.api.tasks.create_course_associated_programs') + @mock.patch('enterprise_catalog.apps.api.tasks._update_full_content_metadata_program') + def test_update_full_content_metadata_course_dry_run( + self, + mock_update_content_metadata_program, + mock_create_course_associated_programs, + mock_filter, + mock_get_course_reviews, + mock_fetch_courses_by_keys + ): + # Mock data + content_keys = ['course1', 'course2'] + full_course_dicts = [ + {'key': 'course1', 'title': 'Course 1'}, + {'key': 'course2', 'title': 'Course 2'} + ] + reviews_for_courses_dict = { + 'course1': {'reviews_count': 10, 'avg_course_rating': 4.5}, + 'course2': {'reviews_count': 5, 'avg_course_rating': 3.8} + } + content_metadata_1 = ContentMetadataFactory(content_type=COURSE, content_key='course1') + content_metadata_2 = ContentMetadataFactory(content_type=COURSE, content_key='course2') + metadata_records_for_fetched_keys = [content_metadata_1, content_metadata_2] + + # Configure mock objects + mock_fetch_courses_by_keys.return_value = full_course_dicts + mock_get_course_reviews.return_value = reviews_for_courses_dict + mock_filter.return_value = metadata_records_for_fetched_keys + + # Call the function + tasks._update_full_content_metadata_course(content_keys, dry_run=True) # pylint: disable=protected-access + + mock_fetch_courses_by_keys.assert_called_once_with(content_keys) + mock_get_course_reviews.assert_called_once_with(['course1', 'course2']) + mock_filter.assert_called_once_with(content_key__in=['course1', 'course2']) + + content_metadata_1.refresh_from_db() + content_metadata_2.refresh_from_db() + assert content_metadata_1.json_metadata.get('reviews_count') is None + assert content_metadata_1.json_metadata.get('avg_course_rating') is None + assert content_metadata_2.json_metadata.get('reviews_count') is None + assert content_metadata_2.json_metadata.get('avg_course_rating') is None + + self.assertEqual(mock_update_content_metadata_program.call_count, 2) + self.assertEqual(mock_create_course_associated_programs.call_count, 2) diff --git a/enterprise_catalog/apps/catalog/management/commands/tests/test_update_content_metadata.py b/enterprise_catalog/apps/catalog/management/commands/tests/test_update_content_metadata.py index 6628238b8..faa5a6f78 100644 --- a/enterprise_catalog/apps/catalog/management/commands/tests/test_update_content_metadata.py +++ b/enterprise_catalog/apps/catalog/management/commands/tests/test_update_content_metadata.py @@ -58,14 +58,14 @@ def 
test_update_content_metadata_for_all_queries( Verify that the job creates an update task for every catalog query """ call_command(self.command_name) - assert mock_fetch_missing_pathway.si.call_args._get_call_arguments()[1] == {"force": False} - assert mock_fetch_missing_course.si.call_args._get_call_arguments()[1] == {"force": False} + assert mock_fetch_missing_pathway.si.call_args._get_call_arguments()[1] == {"force": False, "dry_run": False} + assert mock_fetch_missing_course.si.call_args._get_call_arguments()[1] == {"force": False, "dry_run": False} mock_group.assert_called_once_with([ - mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=False), - mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=False, dry_run=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=False, dry_run=False), ]) - mock_full_metadata_task.si.assert_called_once_with(force=False) + mock_full_metadata_task.si.assert_called_once_with(force=False, dry_run=False) @mock.patch( 'enterprise_catalog.apps.catalog.management.commands.update_content_metadata.fetch_missing_course_metadata_task') @@ -89,10 +89,10 @@ def test_update_content_metadata_for_filtered_queries( mock_fetch_missing_pathway.si.assert_called() mock_fetch_missing_course.si.assert_called() mock_group.assert_called_once_with([ - mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=False), - mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=False, dry_run=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=False, dry_run=False), ]) - mock_full_metadata_task.si.assert_called_once_with(force=False) + mock_full_metadata_task.si.assert_called_once_with(force=False, dry_run=False) @mock.patch( 'enterprise_catalog.apps.catalog.management.commands.update_content_metadata.fetch_missing_course_metadata_task') @@ -108,9 +108,31 @@ def test_force_update_content_metadata( Verify that the job creates an update task for every catalog query """ call_command(self.command_name, force=True) - assert mock_fetch_missing_pathway.si.call_args._get_call_arguments()[1] == {"force": True} + assert mock_fetch_missing_pathway.si.call_args._get_call_arguments()[1] == {"force": True, "dry_run": False} mock_group.assert_called_once_with([ - mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=True), - mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=True), + mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=True, dry_run=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=True, dry_run=False), ]) - mock_full_metadata_task.si.assert_called_once_with(force=True) + mock_full_metadata_task.si.assert_called_once_with(force=True, dry_run=False) + + @mock.patch( + 'enterprise_catalog.apps.catalog.management.commands.update_content_metadata.fetch_missing_course_metadata_task') + @mock.patch( + 'enterprise_catalog.apps.catalog.management.commands.update_content_metadata.fetch_missing_pathway_metadata_task') + @mock.patch('enterprise_catalog.apps.catalog.management.commands.update_content_metadata.group') + @mock.patch('enterprise_catalog.apps.catalog.management.commands.update_content_metadata.update_catalog_metadata_task') + @mock.patch('enterprise_catalog.apps.catalog.management.commands.update_content_metadata.update_full_content_metadata_task') + def 
test_update_content_metadata_no_async( + self, mock_full_metadata_task, mock_catalog_task, mock_group, mock_fetch_missing_pathway, mock_fetch_missing_course + ): + """ + Verify that the tasks are executed synchronously when --no-async flag is set + """ + call_command(self.command_name, force=True, no_async=True) + mock_fetch_missing_pathway.apply.assert_called_once_with(kwargs={"force": True, "dry_run": False}) + mock_fetch_missing_course.apply.assert_called_once_with(kwargs={"force": True, "dry_run": False}) + mock_group.assert_called_once_with([ + mock_catalog_task.s(catalog_query_id=self.catalog_query_a, force=True, dry_run=False), + mock_catalog_task.s(catalog_query_id=self.catalog_query_b, force=True, dry_run=False), + ]) + mock_full_metadata_task.apply.assert_called_once_with(kwargs={"force": True, "dry_run": False}) diff --git a/enterprise_catalog/apps/catalog/management/commands/update_content_metadata.py b/enterprise_catalog/apps/catalog/management/commands/update_content_metadata.py index a61741637..2a564c45c 100644 --- a/enterprise_catalog/apps/catalog/management/commands/update_content_metadata.py +++ b/enterprise_catalog/apps/catalog/management/commands/update_content_metadata.py @@ -24,29 +24,36 @@ class Command(BaseCommand): 'Updates Content Metadata, along with the associations of Catalog Queries and Content Metadata.' ) - def _update_catalog_metadata_task(self, catalog_query, force=False): - message = ( + def _update_catalog_metadata_task(self, catalog_query, no_async, **kwargs): + async_message = ( 'Spinning off update_catalog_metadata_task from update_content_metadata command' ' to update content_metadata for catalog query %s.' ) - logger.info(message, catalog_query) - return update_catalog_metadata_task.s(catalog_query.id, force=force) + if not no_async: + logger.info(async_message, catalog_query) + return update_catalog_metadata_task.s(catalog_query.id, **kwargs) - def _fetch_missing_course_metadata_task(self, force=False): + def _fetch_missing_course_metadata_task_async(self, **kwargs): logger.info( 'Spinning off fetch_missing_course_metadata_task from update_content_metadata command' ' to update content_metadata of missing courses.' ) - return fetch_missing_course_metadata_task.si(force=force) + return fetch_missing_course_metadata_task.si(**kwargs).apply_async().get( + timeout=TASK_TIMEOUT, + propagate=True, + ) - def _fetch_missing_pathway_metadata_task(self, force=False): + def _fetch_missing_pathway_metadata_task_async(self, **kwargs): logger.info( 'Spinning off fetch_missing_pathway_metadata_task from update_content_metadata command' ' to update content_metadata of missing pathways.' ) - return fetch_missing_pathway_metadata_task.si(force=force) + return fetch_missing_pathway_metadata_task.si(**kwargs).apply_async().get( + timeout=TASK_TIMEOUT, + propagate=True, + ) - def _update_full_content_metadata_task(self, *args, **kwargs): + def _update_full_content_metadata_task_async(self, **kwargs): """ Returns a task signature for the `update_full_content_metadata_task`. @@ -61,7 +68,10 @@ def _update_full_content_metadata_task(self, *args, **kwargs): # task.si() is used as a shortcut for an immutable signature to avoid calling this with the results from the # previously run `update_catalog_metadata_task`. 
# https://docs.celeryproject.org/en/master/userguide/canvas.html#immutability - return update_full_content_metadata_task.si(force=kwargs.get('force', False)) + return update_full_content_metadata_task.si(**kwargs).apply_async().get( + timeout=TASK_TIMEOUT, + propagate=True, + ) def add_arguments(self, parser): # Argument to force execution of celery task, ignoring time since last execution @@ -74,6 +84,20 @@ def add_arguments(self, parser): 'if a record is present and enabled.' ), ) + parser.add_argument( + '--dry-run', + dest='dry_run', + default=False, + action='store_true', + help='Log the content metadata changes that would be made, without committing them to the database.', + ) + parser.add_argument( + '--no-async', + dest='no_async', + default=False, + action='store_true', + help='Run the task synchronously (without celery).', + ) def handle(self, *args, **options): """ @@ -82,17 +106,15 @@ def handle(self, *args, **options): """ options.update(CatalogUpdateCommandConfig.current_options()) - # Fetch program metadata for the programs that are missing. - self._fetch_missing_pathway_metadata_task(force=options['force']).apply_async().get( - timeout=TASK_TIMEOUT, - propagate=True, - ) - - # Fetch course metadata for the courses that are missing. - self._fetch_missing_course_metadata_task(force=options['force']).apply_async().get( - timeout=TASK_TIMEOUT, - propagate=True, - ) + no_async = options.get('no_async', False) + flags = {k: options.get(k, False) for k in ('force', 'dry_run')} + # Fetch metadata for the courses/programs that are missing. + if no_async: + fetch_missing_pathway_metadata_task.apply(kwargs=flags) + fetch_missing_course_metadata_task.apply(kwargs=flags) + else: + self._fetch_missing_pathway_metadata_task_async(**flags) + self._fetch_missing_course_metadata_task_async(**flags) # find all CatalogQuery records used by at least one EnterpriseCatalog to avoid # calling /search/all/ for a CatalogQuery that is not currently used by any catalogs. @@ -116,15 +138,18 @@ def handle(self, *args, **options): # https://docs.celeryproject.org/en/v5.0.5/userguide/canvas.html update_group = group( [ - self._update_catalog_metadata_task(catalog_query, force=options['force']) + self._update_catalog_metadata_task(catalog_query, no_async, **flags) for catalog_query in catalog_queries ] ) try: - update_group_result = update_group.apply_async().get( - timeout=TASK_TIMEOUT, - propagate=True, - ) + if no_async: + update_group_result = update_group.apply(kwargs=flags) + else: + update_group_result = update_group.apply_async().get( + timeout=TASK_TIMEOUT, + propagate=True, + ) logger.info( 'Finished doing catalog metadata update related to {} CatalogQueries'.format(len(update_group_result)) ) @@ -146,11 +171,10 @@ def handle(self, *args, **options): ) try: - full_update_task = self._update_full_content_metadata_task(force=options['force']) - full_update_result = full_update_task.apply_async().get( - timeout=TASK_TIMEOUT, - propagate=True, - ) + if no_async: + update_full_content_metadata_task.apply(kwargs=flags) + else: + self._update_full_content_metadata_task_async(**flags) logger.info('Finished doing full update of metadata records.') except Exception as exc: # See comment above about celery exception prefixes.
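The two options added above are exposed through the standard Django management command interface. A minimal usage sketch, assuming only the option destinations `dry_run` and `no_async` defined in add_arguments() in this patch; the sketch is illustrative and not part of the diff:

    # Runs the update with changes logged instead of persisted, executing the
    # celery tasks eagerly in-process rather than dispatching them to workers.
    from django.core.management import call_command

    call_command('update_content_metadata', dry_run=True, no_async=True)

    # Equivalent CLI invocation:
    #   ./manage.py update_content_metadata --dry-run --no-async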
diff --git a/enterprise_catalog/apps/catalog/models.py b/enterprise_catalog/apps/catalog/models.py index 6083c81cf..0a0df0d23 100644 --- a/enterprise_catalog/apps/catalog/models.py +++ b/enterprise_catalog/apps/catalog/models.py @@ -693,7 +693,7 @@ def _partition_content_metadata_defaults(batched_metadata, existing_metadata_by_ return existing_metadata_defaults, nonexisting_metadata_defaults -def _update_existing_content_metadata(existing_metadata_defaults, existing_metadata_by_key): +def _update_existing_content_metadata(existing_metadata_defaults, existing_metadata_by_key, dry_run=False): """ Iterates through existing ContentMetadata database objects, updating the values of various fields based on the defaults provided. @@ -703,6 +703,7 @@ def _update_existing_content_metadata(existing_metadata_defaults, existing_metad to update the existing ContentMetadata database objects. existing_metadata_by_key (dict): Dictionary of existing ContentMetadata database objects to update by content_key. + dry_run (boolean): Logs rather than commits updated content metadata Returns: list: List of ContentMetadata objects that were updated. @@ -720,24 +721,29 @@ def _update_existing_content_metadata(existing_metadata_defaults, existing_metad setattr(content_metadata, key, value) metadata_list.append(content_metadata) - metadata_fields_to_update = ['content_key', 'parent_content_key', 'content_type', 'json_metadata'] - batch_size = settings.UPDATE_EXISTING_CONTENT_METADATA_BATCH_SIZE - for batched_metadata in batch(metadata_list, batch_size=batch_size): - try: - ContentMetadata.objects.bulk_update( - batched_metadata, - metadata_fields_to_update, - batch_size=batch_size, - ) - except OperationalError: - content_keys = [record.content_key for record in batched_metadata] - log_message = 'Operational error while updating batch of ContentMetadata objects with keys: %s' - LOGGER.exception(log_message, content_keys) - raise + if dry_run: + LOGGER.info(f"[Dry Run] Number of Content Metadata records that would have been updated: {len(metadata_list)}") + for metadata in metadata_list: + LOGGER.info(f"[Dry Run] Skipping Content Metadata update: {metadata}") + else: + metadata_fields_to_update = ['content_key', 'parent_content_key', 'content_type', 'json_metadata'] + batch_size = settings.UPDATE_EXISTING_CONTENT_METADATA_BATCH_SIZE + for batched_metadata in batch(metadata_list, batch_size=batch_size): + try: + ContentMetadata.objects.bulk_update( + batched_metadata, + metadata_fields_to_update, + batch_size=batch_size, + ) + except OperationalError: + content_keys = [record.content_key for record in batched_metadata] + log_message = 'Operational error while updating batch of ContentMetadata objects with keys: %s' + LOGGER.exception(log_message, content_keys) + raise return metadata_list -def _create_new_content_metadata(nonexisting_metadata_defaults): +def _create_new_content_metadata(nonexisting_metadata_defaults, dry_run=False): """ Creates new ContentMetadata database objects based on the defaults provided. This is done through an atomic database transaction. @@ -745,15 +751,20 @@ def _create_new_content_metadata(nonexisting_metadata_defaults): Arguments: nonexisting_metadata_defaults (list): List of default values for various fields to create non-existing ContentMetadata database objects. + dry_run (boolean): Logs rather than commits newly-created content metadata. Returns: - list: List of ContentMetadata objects that were created. 
+ list: List of ContentMetadata objects that were created (or logged if dry_run=True). """ metadata_list = [] try: with transaction.atomic(): for defaults in nonexisting_metadata_defaults: - content_metadata = ContentMetadata.objects.create(**defaults) + if dry_run: + content_metadata = ContentMetadata(**defaults) + LOGGER.info(f"[Dry Run] Would have created {content_metadata}") + else: + content_metadata = ContentMetadata.objects.create(**defaults) metadata_list.append(content_metadata) except IntegrityError: LOGGER.exception('_create_new_content_metadata ran into an issue while creating new ContentMetadata objects.') @@ -797,13 +808,14 @@ def _should_allow_metadata(metadata_entry, catalog_query=None): return False -def create_content_metadata(metadata, catalog_query=None): +def create_content_metadata(metadata, catalog_query=None, dry_run=False): """ Creates or updates a ContentMetadata object. Arguments: metadata (list): List of content metadata dictionaries. catalog_query (CatalogQuery): Catalog Query object. + dry_run (boolean): Logs rather than commits content metadata additions. Returns: list: The list of ContentMetaData. @@ -824,17 +836,21 @@ def create_content_metadata(metadata, catalog_query=None): ) # Update existing ContentMetadata records - updated_metadata = _update_existing_content_metadata(existing_metadata_defaults, existing_metadata_by_key) + updated_metadata = _update_existing_content_metadata( + existing_metadata_defaults, + existing_metadata_by_key, + dry_run + ) metadata_list.extend(updated_metadata) # Create new ContentMetadata records - created_metadata = _create_new_content_metadata(nonexisting_metadata_defaults) + created_metadata = _create_new_content_metadata(nonexisting_metadata_defaults, dry_run) metadata_list.extend(created_metadata) return metadata_list -def associate_content_metadata_with_query(metadata, catalog_query): +def associate_content_metadata_with_query(metadata, catalog_query, dry_run=False): """ Creates or updates a ContentMetadata object for each entry in `metadata`, and then associates that object with the `catalog_query` provided. Arguments: metadata (list): List of content metadata dictionaries. catalog_query (CatalogQuery): CatalogQuery object + dry_run (boolean): Logs rather than commits updated content metadata. Returns: list: The list of content_keys for the metadata associated with the query. """ - metadata_list = create_content_metadata(metadata, catalog_query) + metadata_list = create_content_metadata(metadata, catalog_query, dry_run) # Setting `clear=True` will remove all prior relationships between # the CatalogQuery's associated ContentMetadata objects # before setting all new relationships from `metadata_list`.
# https://docs.djangoproject.com/en/2.2/ref/models/relations/#django.db.models.fields.related.RelatedManager.set - catalog_query.contentmetadata_set.set(metadata_list, clear=True) + if dry_run: + old_metadata_count = catalog_query.contentmetadata_set.count() + new_metadata_count = len(metadata_list) + if old_metadata_count != new_metadata_count: + LOGGER.info('[Dry Run] Updated metadata count ({} -> {}) for {}'.format( + old_metadata_count, new_metadata_count, catalog_query)) + else: + catalog_query.contentmetadata_set.set(metadata_list, clear=True) + associated_content_keys = [metadata.content_key for metadata in metadata_list] return associated_content_keys @@ -944,7 +969,7 @@ def __repr__(self): return self.__str__() -def update_contentmetadata_from_discovery(catalog_query): +def update_contentmetadata_from_discovery(catalog_query, dry_run=False): """ Takes a CatalogQuery, uses cache or the Discovery API client to retrieve associated metadata, and then creates/updates ContentMetadata objects. @@ -954,6 +979,7 @@ def update_contentmetadata_from_discovery(catalog_query): Args: catalog_query (CatalogQuery): The catalog query to pass to discovery's /search/all endpoint. + dry_run (boolean): Logs rather than commits updated content metadata. Returns: list of str: Returns the content keys that were associated from the query results. """ @@ -977,7 +1003,7 @@ def update_contentmetadata_from_discovery(catalog_query): catalog_query, ) - associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query) + associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query, dry_run) LOGGER.info( 'Associated %d content items (%d unique) with catalog query %s', len(associated_content_keys), diff --git a/enterprise_catalog/apps/catalog/tests/test_models.py b/enterprise_catalog/apps/catalog/tests/test_models.py index eaa5c736e..c8e04e895 100644 --- a/enterprise_catalog/apps/catalog/tests/test_models.py +++ b/enterprise_catalog/apps/catalog/tests/test_models.py @@ -292,6 +292,113 @@ def test_contentmetadata_update_from_discovery_ignore_exec_ed(self, mock_client) assert course_run_cm not in associated_metadata assert program_cm in associated_metadata + @override_settings(DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT=0) + @mock.patch('enterprise_catalog.apps.api_client.discovery.DiscoveryApiClient') + def test_contentmetadata_update_from_discovery_dry_run_create(self, mock_client): + """ + update_contentmetadata_from_discovery should not create ContentMetadata + objects from the discovery service /search/all api call under dry_run option + """ + course_metadata = OrderedDict([ + ('aggregation_key', 'course:edX+testX'), + ('key', 'edX+testX'), + ('title', 'test course'), + ]) + course_run_metadata = OrderedDict([ + ('aggregation_key', 'courserun:edX+testX'), + ('key', 'course-v1:edX+testX+1'), + ('title', 'test course run'), + ]) + program_metadata = OrderedDict([ + ('aggregation_key', 'program:c7d546f2-a442-49d2-8ef1-4cb64f46df88'), + ('title', 'test program'), + ('uuid', '6e8e47ed-28d8-4861-917e-cedca1135a3f'), + ]) + mock_client.return_value.get_metadata_by_query.return_value = [ + course_metadata, + course_run_metadata, + program_metadata, + ] + catalog = factories.EnterpriseCatalogFactory() + + self.assertEqual(ContentMetadata.objects.count(), 0) + update_contentmetadata_from_discovery(catalog.catalog_query, dry_run=True) + mock_client.assert_called_once() + self.assertEqual(ContentMetadata.objects.count(), 0) + + 
@override_settings(DISCOVERY_CATALOG_QUERY_CACHE_TIMEOUT=0) + @mock.patch('enterprise_catalog.apps.api_client.discovery.DiscoveryApiClient') + def test_contentmetadata_update_from_discovery_dry_run_update(self, mock_client): + """ + update_contentmetadata_from_discovery should not update ContentMetadata + objects from the discovery service /search/all api call under dry_run option + """ + course_metadata = OrderedDict([ + ('aggregation_key', 'course:edX+testX'), + ('key', 'edX+testX'), + ('title', 'test course'), + ]) + course_run_metadata = OrderedDict([ + ('aggregation_key', 'courserun:edX+testX'), + ('key', 'course-v1:edX+testX+1'), + ('title', 'test course run'), + ]) + program_metadata = OrderedDict([ + ('aggregation_key', 'program:c7d546f2-a442-49d2-8ef1-4cb64f46df88'), + ('title', 'test program'), + ('uuid', '6e8e47ed-28d8-4861-917e-cedca1135a3f'), + ]) + mock_client.return_value.get_metadata_by_query.return_value = [ + course_metadata, + course_run_metadata, + program_metadata, + ] + catalog = factories.EnterpriseCatalogFactory() + + self.assertEqual(ContentMetadata.objects.count(), 0) + update_contentmetadata_from_discovery(catalog.catalog_query) + mock_client.assert_called_once() + self.assertEqual(ContentMetadata.objects.count(), 3) + + associated_metadata = catalog.content_metadata + + # Assert stored content metadata is correct for each type + course_cm = ContentMetadata.objects.get(content_key=course_metadata['key']) + self.assertEqual(course_cm.content_type, COURSE) + self.assertEqual(course_cm.parent_content_key, None) + self.assertEqual(course_cm.json_metadata, course_metadata) + assert course_cm in associated_metadata + + course_run_cm = ContentMetadata.objects.get(content_key=course_run_metadata['key']) + self.assertEqual(course_run_cm.content_type, COURSE_RUN) + self.assertEqual(course_run_cm.parent_content_key, course_metadata['key']) + self.assertEqual(course_run_cm.json_metadata, course_run_metadata) + assert course_run_cm in associated_metadata + + program_cm = ContentMetadata.objects.get(content_key=program_metadata['uuid']) + self.assertEqual(program_cm.content_type, PROGRAM) + self.assertEqual(program_cm.parent_content_key, None) + self.assertEqual(program_cm.json_metadata, program_metadata) + assert program_cm in associated_metadata + + # Run again with existing ContentMetadata database objects, temporarily modifying + # the json_metadata of the existing course to remove a field that would later be + # added in the dry_run=False case, verifying it doesn't update in the dry_run=True case + course_cm = ContentMetadata.objects.get(content_key=course_metadata['key']) + course_cm.json_metadata = { + 'key': course_metadata['key'], + 'title': course_metadata['title'], + } + course_cm.save() + update_contentmetadata_from_discovery(catalog.catalog_query, dry_run=True) + self.assertEqual(ContentMetadata.objects.count(), 3) # assert all ContentMetadata objects are preserved + course_cm = ContentMetadata.objects.get(content_key=course_metadata['key']) + # assert json_metadata is not updated to include fields plucked from /search/all metadata. + self.assertNotEqual( + json.dumps(course_cm.json_metadata, sort_keys=True), + json.dumps(course_metadata, sort_keys=True), + ) + @contextmanager def _mock_enterprise_customer_cache( self,
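For reference, a short sketch of how dry_run threads through the models.py helpers changed above, mirroring the new tests in test_models.py. It assumes at least one existing CatalogQuery and that CatalogQuery is importable from the same models module; illustrative only, not part of the diff:

    from enterprise_catalog.apps.catalog.models import (
        CatalogQuery,
        update_contentmetadata_from_discovery,
    )

    catalog_query = CatalogQuery.objects.first()  # any query attached to a catalog

    # dry_run=True is passed down through associate_content_metadata_with_query()
    # and create_content_metadata(), so discovery results are fetched and compared
    # but nothing is written; would-be creates/updates and changed association
    # counts are only logged.
    content_keys = update_contentmetadata_from_discovery(catalog_query, dry_run=True)

    # The returned keys still describe what the query would have been associated with.
    print(len(content_keys))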