From 37854c1de6062f5ba896d62a7ad7a500b95f8279 Mon Sep 17 00:00:00 2001 From: rayzhou-bit Date: Wed, 11 Dec 2024 09:25:07 +0000 Subject: [PATCH] feat: async request calls --- cms/djangoapps/contentstore/tasks.py | 92 +++++++++++++++------------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py index f10fac84df2..fddfa28dad6 100644 --- a/cms/djangoapps/contentstore/tasks.py +++ b/cms/djangoapps/contentstore/tasks.py @@ -9,6 +9,9 @@ import tarfile import re import requests +import aiohttp +import asyncio +import time from datetime import datetime from tempfile import NamedTemporaryFile, mkdtemp @@ -1107,13 +1110,6 @@ def check_broken_links(self, user_id, course_key_string, language): """ Checks for broken links in a course. Store the results in a file. """ - URL_STATUS = { - 'success': '200 OK', - 'forbidden': '403 Forbidden', - 'failure': 'Request Failed', - 'error': 'Request Error' - } - def validate_user(): """Validate if the user exists. Otherwise log error. """ try: @@ -1124,10 +1120,11 @@ def validate_user(): return def get_urls(content): - """Returns all urls after href and src in content.""" - regex = r'\s+(?:href|src)=["\']([^"\']*)["\']' - urls = re.findall(regex, content) - return urls + """Returns all urls foundafter href and src in content. + Excludes urls that are only '#'.""" + regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']' + url_list = re.findall(regex, content) + return url_list def is_studio_url(url): """Returns True if url is a studio url.""" @@ -1152,66 +1149,77 @@ def convert_to_standard_url(url, course_key): else: return url - def validate_url_access(url): - """Returns status of a url request.""" + async def validate_url_access(session, url_data, course_key): + """Returns status of a url request. + url_list is [id, url]""" + block_id, url = url_data + result = {'block_id': block_id, 'url': url} + standardized_url = convert_to_standard_url(url, course_key) try: - response = requests.get(url, timeout=5) - if response.status_code == 200: - return URL_STATUS['success'] - elif response.status_code == 403: - return URL_STATUS['forbidden'] - else: - return URL_STATUS['failure'] - except requests.exceptions.RequestException as e: - return URL_STATUS['error'] + async with session.get(standardized_url, timeout=5) as response: + result.update({'status': response.status}) + except Exception as e: + result.update({'status': None}) + print('error', type(e), e, url) + return result + + async def validate_urls_access(url_list, course_key): + """Returns the statuses of a list of url requests. + url_list is [block_id, url]""" + async with aiohttp.ClientSession() as session: + tasks = [validate_url_access(session, url_data, course_key) for url_data in url_list] + responses = await asyncio.gather(*tasks) + return responses def scan_course_for_links(course_key): """ Returns a list of links that are broken or locked. [block_id, link, is_locked] """ - links = [] verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only) blocks = [] + links_to_validate = [] for vertical in verticals: blocks.extend(vertical.get_children()) for block in blocks: - usage_key = block.usage_key + block_id = str(block.usage_key) block_info = get_block_info(block) block_data = block_info['data'] - urls = get_urls(block_data) - - for url in urls: - if url == '#': # do not evaluate these 'url' - break - standardized_url = convert_to_standard_url(url, course_key) - status = validate_url_access(standardized_url) + url_list = get_urls(block_data) + links_to_validate += [[block_id, url] for url in url_list] - if status == URL_STATUS['failure']: - links.append([str(usage_key), url, False]) - if status == URL_STATUS['forbidden'] and is_studio_url(url): - links.append([str(usage_key), url, True]) - - return links + return links_to_validate user = validate_user() self.status.set_state('Scanning') - courselike_key = CourseKey.from_string(course_key_string) - data = scan_course_for_links(courselike_key) + course_key = CourseKey.from_string(course_key_string) + links_list = scan_course_for_links(course_key) + results = asyncio.run(validate_urls_access(links_list, course_key)) + + final_results = [] + for result in results: + if result['status'] == None: # Request error + print('retry') # TODO retry + if result['status'] == 200: # OK + print('remove from list') # TODO remove + elif result['status'] == 403 and is_studio_url(result['url']): + final_results.append([result['block_id'], result['url'], True]) + else: + final_results.append([result['block_id'], result['url'], False]) try: self.status.increment_completed_steps() - file_name = str(courselike_key) + file_name = str(course_key) links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json') LOGGER.debug('json file being generated at %s', links_file.name) with open(links_file.name, 'w') as file: - json.dump(data, file, indent=4) + json.dump(final_results, file, indent=4) artifact = UserTaskArtifact(status=self.status, name='BrokenLinks') artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file)) @@ -1219,7 +1227,7 @@ def scan_course_for_links(course_key): # catch all exceptions so we can record useful error messages except Exception as exception: # pylint: disable=broad-except - LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True) + LOGGER.exception('Error checking links for course %s', course_key, exc_info=True) if self.status.state != UserTaskStatus.FAILED: self.status.fail({'raw_error_msg': str(exception)}) return