From d4b273164a3d50b649aad47c67f13db6a20a709b Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Fri, 6 Dec 2024 00:26:55 +0000 Subject: [PATCH 1/5] feat: locked link --- .../core/course_optimizer_provider.py | 4 +- cms/djangoapps/contentstore/tasks.py | 62 +++++++++++-------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/cms/djangoapps/contentstore/core/course_optimizer_provider.py b/cms/djangoapps/contentstore/core/course_optimizer_provider.py index c24ed45b1a02..9a6ecfdaebeb 100644 --- a/cms/djangoapps/contentstore/core/course_optimizer_provider.py +++ b/cms/djangoapps/contentstore/core/course_optimizer_provider.py @@ -54,8 +54,8 @@ def generate_broken_links_descriptor(json_content, request_user): xblock_dictionary = {} # dictionary of xblock attributes for item in json_content: - block_id, link, *is_locked = item - is_locked_flag = bool(is_locked) + block_id, link, *rest = item + is_locked_flag = bool(rest[0]) usage_key = usage_key_with_run(block_id) block = get_xblock(usage_key, request_user) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 00b39a4bda2a..f10fac84df2f 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1107,6 +1107,13 @@ def check_broken_links(self, user_id, course_key_string, language): """ Checks for broken links in a course. Store the results in a file. """ + URL_STATUS = { + 'success': '200 OK', + 'forbidden': '403 Forbidden', + 'failure': 'Request Failed', + 'error': 'Request Error' + } + def validate_user(): """Validate if the user exists. Otherwise log error. """ try: @@ -1121,16 +1128,20 @@ def get_urls(content): regex = r'\s+(?:href|src)=["\']([^"\']*)["\']' urls = re.findall(regex, content) return urls + + def is_studio_url(url): + """Returns True if url is a studio url.""" + return not url.startswith('http://') and not url.startswith('https://') def convert_to_standard_url(url, course_key): """ - Returns standard urls when given studio urls. + Returns standard urls when given studio urls. Otherwise return url as is. Example urls: /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png /static/getting-started_x250.png /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7 """ - if not url.startswith('http://') and not url.startswith('https://'): + if is_studio_url(url): if url.startswith('/static/'): processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1] return 'http://' + settings.CMS_BASE + processed_url @@ -1138,36 +1149,29 @@ def convert_to_standard_url(url, course_key): return 'http://' + settings.CMS_BASE + url else: return 'http://' + settings.CMS_BASE + '/container/' + url + else: + return url - def check_url(url): - """Returns SUCCESS, ACCESS_DENIED, or FAILURE after checking url request""" + def validate_url_access(url): + """Returns status of a url request.""" try: response = requests.get(url, timeout=5) if response.status_code == 200: - return "SUCCESS" + return URL_STATUS['success'] elif response.status_code == 403: - return "ACCESS_DENIED" + return URL_STATUS['forbidden'] else: - return "FAILURE" - except requests.exceptions.RequestException as e: - return "FAILURE" - - def verify_url(url): - """Returns true if url request returns 200""" - try: - response = requests.get(url, timeout=5) - return response.status_code == 200 + return URL_STATUS['failure'] except requests.exceptions.RequestException as e: - return False + return URL_STATUS['error'] - def scan_course(course_key): + def scan_course_for_links(course_key): """ - Scans the course and returns broken link tuples. - [, ] + Returns a list of links that are broken or locked. + [block_id, link, is_locked] """ - broken_links = [] + links = [] verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) - print('verticals: ', verticals) blocks = [] for vertical in verticals: @@ -1180,20 +1184,24 @@ def scan_course(course_key): urls = get_urls(block_data) for url in urls: - if url == '#': + if url == '#': # do not evaluate these 'url' break + standardized_url = convert_to_standard_url(url, course_key) - # TODO if check_url ACCESS_DENIED and is studio link, it's locked - if not verify_url(standardized_url): - broken_links.append([str(usage_key), url]) + status = validate_url_access(standardized_url) + + if status == URL_STATUS['failure']: + links.append([str(usage_key), url, False]) + if status == URL_STATUS['forbidden'] and is_studio_url(url): + links.append([str(usage_key), url, True]) - return broken_links + return links user = validate_user() self.status.set_state('Scanning') courselike_key = CourseKey.from_string(course_key_string) - data = scan_course(courselike_key) + data = scan_course_for_links(courselike_key) try: self.status.increment_completed_steps() From 37854c1de6062f5ba896d62a7ad7a500b95f8279 Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Wed, 11 Dec 2024 09:25:07 +0000 Subject: [PATCH 2/5] feat: async request calls --- cms/djangoapps/contentstore/tasks.py | 92 +++++++++++++++------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index f10fac84df2f..fddfa28dad61 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -9,6 +9,9 @@ import tarfile import re import requests +import aiohttp +import asyncio +import time from datetime import datetime from tempfile import NamedTemporaryFile, mkdtemp @@ -1107,13 +1110,6 @@ def check_broken_links(self, user_id, course_key_string, language): """ Checks for broken links in a course. Store the results in a file. """ - URL_STATUS = { - 'success': '200 OK', - 'forbidden': '403 Forbidden', - 'failure': 'Request Failed', - 'error': 'Request Error' - } - def validate_user(): """Validate if the user exists. Otherwise log error. """ try: @@ -1124,10 +1120,11 @@ def validate_user(): return def get_urls(content): - """Returns all urls after href and src in content.""" - regex = r'\s+(?:href|src)=["\']([^"\']*)["\']' - urls = re.findall(regex, content) - return urls + """Returns all urls foundafter href and src in content. + Excludes urls that are only '#'.""" + regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' + url_list = re.findall(regex, content) + return url_list def is_studio_url(url): """Returns True if url is a studio url.""" @@ -1152,66 +1149,77 @@ def convert_to_standard_url(url, course_key): else: return url - def validate_url_access(url): - """Returns status of a url request.""" + async def validate_url_access(session, url_data, course_key): + """Returns status of a url request. + url_list is [id, url]""" + block_id, url = url_data + result = {'block_id': block_id, 'url': url} + standardized_url = convert_to_standard_url(url, course_key) try: - response = requests.get(url, timeout=5) - if response.status_code == 200: - return URL_STATUS['success'] - elif response.status_code == 403: - return URL_STATUS['forbidden'] - else: - return URL_STATUS['failure'] - except requests.exceptions.RequestException as e: - return URL_STATUS['error'] + async with session.get(standardized_url, timeout=5) as response: + result.update({'status': response.status}) + except Exception as e: + result.update({'status': None}) + print('error', type(e), e, url) + return result + + async def validate_urls_access(url_list, course_key): + """Returns the statuses of a list of url requests. + url_list is [block_id, url]""" + async with aiohttp.ClientSession() as session: + tasks = [validate_url_access(session, url_data, course_key) for url_data in url_list] + responses = await asyncio.gather(*tasks) + return responses def scan_course_for_links(course_key): """ Returns a list of links that are broken or locked. [block_id, link, is_locked] """ - links = [] verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) blocks = [] + links_to_validate = [] for vertical in verticals: blocks.extend(vertical.get_children()) for block in blocks: - usage_key = block.usage_key + block_id = str(block.usage_key) block_info = get_block_info(block) block_data = block_info['data'] - urls = get_urls(block_data) - - for url in urls: - if url == '#': # do not evaluate these 'url' - break - standardized_url = convert_to_standard_url(url, course_key) - status = validate_url_access(standardized_url) + url_list = get_urls(block_data) + links_to_validate += [[block_id, url] for url in url_list] - if status == URL_STATUS['failure']: - links.append([str(usage_key), url, False]) - if status == URL_STATUS['forbidden'] and is_studio_url(url): - links.append([str(usage_key), url, True]) - - return links + return links_to_validate user = validate_user() self.status.set_state('Scanning') - courselike_key = CourseKey.from_string(course_key_string) - data = scan_course_for_links(courselike_key) + course_key = CourseKey.from_string(course_key_string) + links_list = scan_course_for_links(course_key) + results = asyncio.run(validate_urls_access(links_list, course_key)) + + final_results = [] + for result in results: + if result['status'] == None: # Request error + print('retry') # TODO retry + if result['status'] == 200: # OK + print('remove from list') # TODO remove + elif result['status'] == 403 and is_studio_url(result['url']): + final_results.append([result['block_id'], result['url'], True]) + else: + final_results.append([result['block_id'], result['url'], False]) try: self.status.increment_completed_steps() - file_name = str(courselike_key) + file_name = str(course_key) links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') LOGGER.debug('json file being generated at %s', links_file.name) with open(links_file.name, 'w') as file: - json.dump(data, file, indent=4) + json.dump(final_results, file, indent=4) artifact = UserTaskArtifact(status=self.status, name='BrokenLinks') artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file)) @@ -1219,7 +1227,7 @@ def scan_course_for_links(course_key): # catch all exceptions so we can record useful error messages except Exception as exception: # pylint: disable=broad-except - LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True) + LOGGER.exception('Error checking links for course %s', course_key, exc_info=True) if self.status.state != UserTaskStatus.FAILED: self.status.fail({'raw_error_msg': str(exception)}) return From 0cfa08853369c3a64eec828dcf22f6059c39b7cb Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Wed, 11 Dec 2024 17:49:38 +0000 Subject: [PATCH 3/5] feat: batching --- cms/djangoapps/contentstore/tasks.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index fddfa28dad61..914838a67f56 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1160,16 +1160,24 @@ async def validate_url_access(session, url_data, course_key): result.update({'status': response.status}) except Exception as e: result.update({'status': None}) - print('error', type(e), e, url) + print('[Validate url error]', type(e), e, url) return result - async def validate_urls_access(url_list, course_key): + async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): """Returns the statuses of a list of url requests. url_list is [block_id, url]""" - async with aiohttp.ClientSession() as session: - tasks = [validate_url_access(session, url_data, course_key) for url_data in url_list] - responses = await asyncio.gather(*tasks) - return responses + responses = [] + url_count = len(url_list) + + for i in range(0, url_count, batch_size): + batch = url_list[i:i + batch_size] + async with aiohttp.ClientSession() as session: + tasks = [validate_url_access(session, url_data, course_key) for url_data in batch] + batch_results = await asyncio.gather(*tasks) + responses.extend(batch_results) + print(f'batch {i // batch_size+1} of {url_count // batch_size + 1}') + + return responses def scan_course_for_links(course_key): """ @@ -1198,7 +1206,7 @@ def scan_course_for_links(course_key): self.status.set_state('Scanning') course_key = CourseKey.from_string(course_key_string) links_list = scan_course_for_links(course_key) - results = asyncio.run(validate_urls_access(links_list, course_key)) + results = asyncio.run(validate_urls_access_in_batches(links_list, course_key, batch_size=100)) final_results = [] for result in results: From 57040df1502210d6ac42fe220ba8c9d38e471911 Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Thu, 12 Dec 2024 20:30:00 +0000 Subject: [PATCH 4/5] feat: retry urls with connection errors --- cms/djangoapps/contentstore/tasks.py | 127 +++++++++++++++++---------- 1 file changed, 80 insertions(+), 47 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 914838a67f56..59d7aabd1a8e 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1120,8 +1120,10 @@ def validate_user(): return def get_urls(content): - """Returns all urls foundafter href and src in content. - Excludes urls that are only '#'.""" + """ + Returns all urls found after href and src in content. + Excludes urls that are only '#'. + """ regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' url_list = re.findall(regex, content) return url_list @@ -1149,9 +1151,33 @@ def convert_to_standard_url(url, course_key): else: return url + def scan_course_for_links(course_key): + """ + Returns a list of all urls in a course. + Returns: [ [block_id1, url1], [block_id2, url2], ... ] + """ + verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) + blocks = [] + urls_to_validate = [] + + for vertical in verticals: + blocks.extend(vertical.get_children()) + + for block in blocks: + block_id = str(block.usage_key) + block_info = get_block_info(block) + block_data = block_info['data'] + + url_list = get_urls(block_data) + urls_to_validate += [[block_id, url] for url in url_list] + + return urls_to_validate + async def validate_url_access(session, url_data, course_key): - """Returns status of a url request. - url_list is [id, url]""" + """ + Returns the status of a url request + Returns: {block_id1, url1, status} + """ block_id, url = url_data result = {'block_id': block_id, 'url': url} standardized_url = convert_to_standard_url(url, course_key) @@ -1160,12 +1186,14 @@ async def validate_url_access(session, url_data, course_key): result.update({'status': response.status}) except Exception as e: result.update({'status': None}) - print('[Validate url error]', type(e), e, url) + LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}') return result async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): - """Returns the statuses of a list of url requests. - url_list is [block_id, url]""" + """ + Returns the statuses of a list of url requests. + Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ] + """ responses = [] url_count = len(url_list) @@ -1175,67 +1203,72 @@ async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): tasks = [validate_url_access(session, url_data, course_key) for url_data in batch] batch_results = await asyncio.gather(*tasks) responses.extend(batch_results) - print(f'batch {i // batch_size+1} of {url_count // batch_size + 1}') + LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}') return responses - def scan_course_for_links(course_key): + def filter_by_status(results): """ - Returns a list of links that are broken or locked. - [block_id, link, is_locked] + Filter results by status. + 200: OK. No need to do more + 403: Forbidden. Record as locked link. + None: Error. Retry up to 3 times. + Other: Failure. Record as broken link. + Returns: + filtered_results: [ [block_id1, url1, is_locked], ... ] + retry_list: [ [block_id1, url1], ... ] """ - verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) - blocks = [] - links_to_validate = [] - - for vertical in verticals: - blocks.extend(vertical.get_children()) - - for block in blocks: - block_id = str(block.usage_key) - block_info = get_block_info(block) - block_data = block_info['data'] - - url_list = get_urls(block_data) - links_to_validate += [[block_id, url] for url in url_list] - - return links_to_validate + filtered_results = [] + retry_list = [] + for result in results: + if result['status'] == None: + retry_list.append([result['block_id'], result['url']]) + elif result['status'] == 200: + continue + elif result['status'] == 403 and is_studio_url(result['url']): + filtered_results.append([result['block_id'], result['url'], True]) + else: + filtered_results.append([result['block_id'], result['url'], False]) + + return filtered_results, retry_list user = validate_user() self.status.set_state('Scanning') course_key = CourseKey.from_string(course_key_string) - links_list = scan_course_for_links(course_key) - results = asyncio.run(validate_urls_access_in_batches(links_list, course_key, batch_size=100)) - - final_results = [] - for result in results: - if result['status'] == None: # Request error - print('retry') # TODO retry - if result['status'] == 200: # OK - print('remove from list') # TODO remove - elif result['status'] == 403 and is_studio_url(result['url']): - final_results.append([result['block_id'], result['url'], True]) - else: - final_results.append([result['block_id'], result['url'], False]) + url_list = scan_course_for_links(course_key) + validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100)) + broken_or_locked_urls, retry_list = filter_by_status(validated_url_list) + + # Retry urls that failed due to connection error + retry_count = 3 + for i in range(0, retry_count): + if retry_list: + LOGGER.debug(f'[Link Check] retry attempt #{i+1}') + retry_validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100)) + retry_results, retry_list = filter_by_status(retry_validated_url_list) + broken_or_locked_urls.extend(retry_results) + + if retry_list: + LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error') try: self.status.increment_completed_steps() file_name = str(course_key) - links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') - LOGGER.debug('json file being generated at %s', links_file.name) + broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') + LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}') - with open(links_file.name, 'w') as file: - json.dump(final_results, file, indent=4) + with open(broken_links_file.name, 'w') as file: + json.dump(broken_or_locked_urls, file, indent=4) artifact = UserTaskArtifact(status=self.status, name='BrokenLinks') - artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file)) + artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file)) artifact.save() # catch all exceptions so we can record useful error messages - except Exception as exception: # pylint: disable=broad-except + except Exception as e: # pylint: disable=broad-except LOGGER.exception('Error checking links for course %s', course_key, exc_info=True) if self.status.state != UserTaskStatus.FAILED: - self.status.fail({'raw_error_msg': str(exception)}) + self.status.fail({'raw_error_msg': str(e)}) return From 1df96b02f63ae66059c11ed8a79b6412c35ca259 Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Sat, 14 Dec 2024 00:46:30 +0000 Subject: [PATCH 5/5] feat: refactor retry_validation --- cms/djangoapps/contentstore/tasks.py | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index 59d7aabd1a8e..451bc9221ea0 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -1206,6 +1206,22 @@ async def validate_urls_access_in_batches(url_list, course_key, batch_size=100): LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}') return responses + + def retry_validation(url_list, course_key, retry_count=3): + """Retry urls that failed due to connection error.""" + results = [] + retry_list = url_list + for i in range(0, retry_count): + if retry_list: + LOGGER.debug(f'[Link Check] retry attempt #{i+1}') + validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100)) + filetered_url_list, retry_list = filter_by_status(validated_url_list) + results.extend(filetered_url_list) + + if retry_list: + LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error') + + return results def filter_by_status(results): """ @@ -1239,18 +1255,10 @@ def filter_by_status(results): url_list = scan_course_for_links(course_key) validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100)) broken_or_locked_urls, retry_list = filter_by_status(validated_url_list) - - # Retry urls that failed due to connection error - retry_count = 3 - for i in range(0, retry_count): - if retry_list: - LOGGER.debug(f'[Link Check] retry attempt #{i+1}') - retry_validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100)) - retry_results, retry_list = filter_by_status(retry_validated_url_list) - broken_or_locked_urls.extend(retry_results) if retry_list: - LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error') + retry_results = retry_validation(retry_list, course_key, retry_count=3) + broken_or_locked_urls.extend(retry_results) try: self.status.increment_completed_steps()