diff --git a/cms/djangoapps/contentstore/core/course_optimizer_provider.py b/cms/djangoapps/contentstore/core/course_optimizer_provider.py
index c24ed45b1a02..9a6ecfdaebeb 100644
--- a/cms/djangoapps/contentstore/core/course_optimizer_provider.py
+++ b/cms/djangoapps/contentstore/core/course_optimizer_provider.py
@@ -54,8 +54,8 @@ def generate_broken_links_descriptor(json_content, request_user):
     xblock_dictionary = {}  # dictionary of xblock attributes
 
     for item in json_content:
-        block_id, link, *is_locked = item
-        is_locked_flag = bool(is_locked)
+        block_id, link, *rest = item
+        is_locked_flag = bool(rest[0]) if rest else False
 
         usage_key = usage_key_with_run(block_id)
         block = get_xblock(usage_key, request_user)
diff --git a/cms/djangoapps/contentstore/tasks.py b/cms/djangoapps/contentstore/tasks.py
index 00b39a4bda2a..451bc9221ea0 100644
--- a/cms/djangoapps/contentstore/tasks.py
+++ b/cms/djangoapps/contentstore/tasks.py
@@ -9,6 +9,9 @@ import tarfile
 import re
 
 import requests
+import aiohttp
+import asyncio
+import time
 from datetime import datetime
 from tempfile import NamedTemporaryFile, mkdtemp
 
@@ -1117,20 +1120,27 @@ def validate_user():
         return
 
     def get_urls(content):
-        """Returns all urls after href and src in content."""
-        regex = r'\s+(?:href|src)=["\']([^"\']*)["\']'
-        urls = re.findall(regex, content)
-        return urls
+        """
+        Returns all urls found after href and src in content.
+        Excludes urls that start with '#'.
+        """
+        regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
+        url_list = re.findall(regex, content)
+        return url_list
+
+    def is_studio_url(url):
+        """Returns True if url is a studio url."""
+        return not url.startswith('http://') and not url.startswith('https://')
 
     def convert_to_standard_url(url, course_key):
         """
-        Returns standard urls when given studio urls.
+        Returns standard urls when given studio urls. Otherwise returns the url as is.
         Example urls:
             /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
             /static/getting-started_x250.png
             /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
         """
-        if not url.startswith('http://') and not url.startswith('https://'):
+        if is_studio_url(url):
             if url.startswith('/static/'):
                 processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
                 return 'http://' + settings.CMS_BASE + processed_url
@@ -1138,80 +1148,135 @@ def convert_to_standard_url(url, course_key):
                 return 'http://' + settings.CMS_BASE + url
             else:
                 return 'http://' + settings.CMS_BASE + '/container/' + url
+        else:
+            return url
 
-    def check_url(url):
-        """Returns SUCCESS, ACCESS_DENIED, or FAILURE after checking url request"""
-        try:
-            response = requests.get(url, timeout=5)
-            if response.status_code == 200:
-                return "SUCCESS"
-            elif response.status_code == 403:
-                return "ACCESS_DENIED"
-            else:
-                return "FAILURE"
-        except requests.exceptions.RequestException as e:
-            return "FAILURE"
-
-    def verify_url(url):
-        """Returns true if url request returns 200"""
-        try:
-            response = requests.get(url, timeout=5)
-            return response.status_code == 200
-        except requests.exceptions.RequestException as e:
-            return False
-
-    def scan_course(course_key):
+    def scan_course_for_links(course_key):
         """
-        Scans the course and returns broken link tuples.
-        [<block_id>, <broken_link>]
+        Returns a list of all urls in a course.
+        Returns: [ [block_id1, url1], [block_id2, url2], ... ]
         """
-        broken_links = []
         verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only)
-        print('verticals: ', verticals)
         blocks = []
+        urls_to_validate = []
 
         for vertical in verticals:
            blocks.extend(vertical.get_children())
 
        for block in blocks:
-            usage_key = block.usage_key
+            block_id = str(block.usage_key)
             block_info = get_block_info(block)
             block_data = block_info['data']
-            urls = get_urls(block_data)
-            for url in urls:
-                if url == '#':
-                    break
-                standardized_url = convert_to_standard_url(url, course_key)
-                # TODO if check_url ACCESS_DENIED and is studio link, it's locked
-                if not verify_url(standardized_url):
-                    broken_links.append([str(usage_key), url])
+            url_list = get_urls(block_data)
+            urls_to_validate += [[block_id, url] for url in url_list]
+
+        return urls_to_validate
+
+    async def validate_url_access(session, url_data, course_key):
+        """
+        Returns the status of a url request.
+        Returns: {'block_id': block_id, 'url': url, 'status': status}
+        """
+        block_id, url = url_data
+        result = {'block_id': block_id, 'url': url}
+        standardized_url = convert_to_standard_url(url, course_key)
+        try:
+            async with session.get(standardized_url, timeout=5) as response:
+                result.update({'status': response.status})
+        except Exception as e:  # pylint: disable=broad-except
+            result.update({'status': None})
+            LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
+        return result
+
+    async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
+        """
+        Returns the statuses of a list of url requests.
+        Returns: [ {'block_id': ..., 'url': ..., 'status': ...}, ... ]
+        """
+        responses = []
+        url_count = len(url_list)
+
+        for i in range(0, url_count, batch_size):
+            batch = url_list[i:i + batch_size]
+            async with aiohttp.ClientSession() as session:
+                tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
+                batch_results = await asyncio.gather(*tasks)
+                responses.extend(batch_results)
+                LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {(url_count + batch_size - 1) // batch_size}')
+
+        return responses
+
+    def retry_validation(url_list, course_key, retry_count=3):
+        """Retry urls that failed due to connection error."""
+        results = []
+        retry_list = url_list
+        for i in range(retry_count):
+            if retry_list:
+                LOGGER.debug(f'[Link Check] retry attempt #{i + 1}')
+                validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100))
+                filtered_url_list, retry_list = filter_by_status(validated_url_list)
+                results.extend(filtered_url_list)
+
+        if retry_list:
+            LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error')
+
+        return results
 
-        return broken_links
+    def filter_by_status(results):
+        """
+        Filter results by status.
+        200: OK. No further action needed.
+        403: Forbidden. Record as a locked link if it is a studio url, otherwise as broken.
+        None: Connection error. Retry up to 3 times.
+        Other: Failure. Record as a broken link.
+        Returns:
+            filtered_results: [ [block_id1, url1, is_locked], ... ]
+            retry_list: [ [block_id1, url1], ... ]
+        """
+        filtered_results = []
+        retry_list = []
+        for result in results:
+            if result['status'] is None:
+                retry_list.append([result['block_id'], result['url']])
+            elif result['status'] == 200:
+                continue
+            elif result['status'] == 403 and is_studio_url(result['url']):
+                filtered_results.append([result['block_id'], result['url'], True])
+            else:
+                filtered_results.append([result['block_id'], result['url'], False])
+
+        return filtered_results, retry_list
 
     user = validate_user()
 
     self.status.set_state('Scanning')
-    courselike_key = CourseKey.from_string(course_key_string)
-    data = scan_course(courselike_key)
+    course_key = CourseKey.from_string(course_key_string)
+    url_list = scan_course_for_links(course_key)
+    validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
+    broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)
+
+    if retry_list:
+        retry_results = retry_validation(retry_list, course_key, retry_count=3)
+        broken_or_locked_urls.extend(retry_results)
 
     try:
         self.status.increment_completed_steps()
-        file_name = str(courselike_key)
-        links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
-        LOGGER.debug('json file being generated at %s', links_file.name)
+        file_name = str(course_key)
+        broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
+        LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}')
 
-        with open(links_file.name, 'w') as file:
-            json.dump(data, file, indent=4)
+        with open(broken_links_file.name, 'w') as file:
+            json.dump(broken_or_locked_urls, file, indent=4)
 
         artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
-        artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
+        artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
         artifact.save()
     # catch all exceptions so we can record useful error messages
-    except Exception as exception:  # pylint: disable=broad-except
-        LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True)
+    except Exception as e:  # pylint: disable=broad-except
+        LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
         if self.status.state != UserTaskStatus.FAILED:
-            self.status.fail({'raw_error_msg': str(exception)})
+            self.status.fail({'raw_error_msg': str(e)})
         return