Skip to content

Commit

Permalink
feat: retry urls with connection errors
Browse files Browse the repository at this point in the history
  • Loading branch information
rayzhou-bit committed Dec 12, 2024
1 parent 0cfa088 commit 57040df
Showing 1 changed file with 80 additions and 47 deletions.
127 changes: 80 additions & 47 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,8 +1120,10 @@ def validate_user():
return

def get_urls(content):
    """
    Extract every url that follows an href or src attribute in content.

    Urls consisting solely of '#' are skipped.
    """
    href_src_pattern = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
    return re.findall(href_src_pattern, content)
Expand Down Expand Up @@ -1149,9 +1151,33 @@ def convert_to_standard_url(url, course_key):
else:
return url

def scan_course_for_links(course_key):
    """
    Collect every url found in the published verticals of a course.

    Returns: [ [block_id1, url1], [block_id2, url2], ... ]
    """
    published = ModuleStoreEnum.RevisionOption.published_only
    verticals = modulestore().get_items(
        course_key, qualifiers={'category': 'vertical'}, revision=published
    )

    urls_to_validate = []
    # Walk every child block of every published vertical, pairing each
    # extracted url with the id of the block it came from.
    for vertical in verticals:
        for child in vertical.get_children():
            block_id = str(child.usage_key)
            block_data = get_block_info(child)['data']
            urls_to_validate.extend([block_id, url] for url in get_urls(block_data))

    return urls_to_validate

async def validate_url_access(session, url_data, course_key):
"""Returns status of a url request.
url_list is [id, url]"""
"""
Returns the status of a url request
Returns: {block_id1, url1, status}
"""
block_id, url = url_data
result = {'block_id': block_id, 'url': url}
standardized_url = convert_to_standard_url(url, course_key)
Expand All @@ -1160,12 +1186,14 @@ async def validate_url_access(session, url_data, course_key):
result.update({'status': response.status})
except Exception as e:
result.update({'status': None})
print('[Validate url error]', type(e), e, url)
LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
return result

async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
"""Returns the statuses of a list of url requests.
url_list is [block_id, url]"""
"""
Returns the statuses of a list of url requests.
Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
"""
responses = []
url_count = len(url_list)

Expand All @@ -1175,67 +1203,72 @@ async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
batch_results = await asyncio.gather(*tasks)
responses.extend(batch_results)
print(f'batch {i // batch_size+1} of {url_count // batch_size + 1}')
LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}')

return responses

def scan_course_for_links(course_key):
def filter_by_status(results):
    """
    Filter url-validation results by response status.

    200: OK. No need to do more.
    403: Forbidden. Recorded as a locked link when it is a studio url.
    None: Request error. Returned separately so the caller can retry.
    Other: Failure. Recorded as a broken link.

    Arguments:
        results: list of {'block_id': ..., 'url': ..., 'status': ...} dicts.

    Returns:
        filtered_results: [ [block_id1, url1, is_locked], ... ]
        retry_list: [ [block_id1, url1], ... ]
    """
    filtered_results = []
    retry_list = []
    for result in results:
        status = result['status']
        # Fix: use identity check `is None` rather than `== None` (PEP 8).
        if status is None:
            retry_list.append([result['block_id'], result['url']])
        elif status == 200:
            continue
        elif status == 403 and is_studio_url(result['url']):
            filtered_results.append([result['block_id'], result['url'], True])
        else:
            filtered_results.append([result['block_id'], result['url'], False])

    return filtered_results, retry_list

user = validate_user()

self.status.set_state('Scanning')
course_key = CourseKey.from_string(course_key_string)
links_list = scan_course_for_links(course_key)
results = asyncio.run(validate_urls_access_in_batches(links_list, course_key, batch_size=100))

final_results = []
for result in results:
if result['status'] == None: # Request error
print('retry') # TODO retry
if result['status'] == 200: # OK
print('remove from list') # TODO remove
elif result['status'] == 403 and is_studio_url(result['url']):
final_results.append([result['block_id'], result['url'], True])
else:
final_results.append([result['block_id'], result['url'], False])
url_list = scan_course_for_links(course_key)
validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)

# Retry urls that failed due to connection error
retry_count = 3
for i in range(0, retry_count):
if retry_list:
LOGGER.debug(f'[Link Check] retry attempt #{i+1}')
retry_validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100))
retry_results, retry_list = filter_by_status(retry_validated_url_list)
broken_or_locked_urls.extend(retry_results)

if retry_list:
LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error')

try:
self.status.increment_completed_steps()

file_name = str(course_key)
links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug('json file being generated at %s', links_file.name)
broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}')

with open(links_file.name, 'w') as file:
json.dump(final_results, file, indent=4)
with open(broken_links_file.name, 'w') as file:
json.dump(broken_or_locked_urls, file, indent=4)

artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as exception: # pylint: disable=broad-except
except Exception as e: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
if self.status.state != UserTaskStatus.FAILED:
self.status.fail({'raw_error_msg': str(exception)})
self.status.fail({'raw_error_msg': str(e)})
return

0 comments on commit 57040df

Please sign in to comment.