Skip to content

Commit

Permalink
feat: async request calls
Browse files Browse the repository at this point in the history
  • Loading branch information
rayzhou-bit committed Dec 11, 2024
1 parent d4b2731 commit 37854c1
Showing 1 changed file with 50 additions and 42 deletions.
92 changes: 50 additions & 42 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import tarfile
import re
import requests
import aiohttp
import asyncio
import time
from datetime import datetime
from tempfile import NamedTemporaryFile, mkdtemp

Expand Down Expand Up @@ -1107,13 +1110,6 @@ def check_broken_links(self, user_id, course_key_string, language):
"""
Checks for broken links in a course. Store the results in a file.
"""
URL_STATUS = {
'success': '200 OK',
'forbidden': '403 Forbidden',
'failure': 'Request Failed',
'error': 'Request Error'
}

def validate_user():
"""Validate if the user exists. Otherwise log error. """
try:
Expand All @@ -1124,10 +1120,11 @@ def validate_user():
return

def get_urls(content):
"""Returns all urls after href and src in content."""
regex = r'\s+(?:href|src)=["\']([^"\']*)["\']'
urls = re.findall(regex, content)
return urls
"""Returns all urls foundafter href and src in content.
Excludes urls that are only '#'."""
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list

def is_studio_url(url):
"""Returns True if url is a studio url."""
Expand All @@ -1152,74 +1149,85 @@ def convert_to_standard_url(url, course_key):
else:
return url

def validate_url_access(url):
"""Returns status of a url request."""
async def validate_url_access(session, url_data, course_key):
"""Returns status of a url request.
url_list is [id, url]"""
block_id, url = url_data
result = {'block_id': block_id, 'url': url}
standardized_url = convert_to_standard_url(url, course_key)
try:
response = requests.get(url, timeout=5)
if response.status_code == 200:
return URL_STATUS['success']
elif response.status_code == 403:
return URL_STATUS['forbidden']
else:
return URL_STATUS['failure']
except requests.exceptions.RequestException as e:
return URL_STATUS['error']
async with session.get(standardized_url, timeout=5) as response:
result.update({'status': response.status})
except Exception as e:
result.update({'status': None})
print('error', type(e), e, url)
return result

async def validate_urls_access(url_list, course_key):
"""Returns the statuses of a list of url requests.
url_list is [block_id, url]"""
async with aiohttp.ClientSession() as session:
tasks = [validate_url_access(session, url_data, course_key) for url_data in url_list]
responses = await asyncio.gather(*tasks)
return responses

def scan_course_for_links(course_key):
"""
Returns a list of links that are broken or locked.
[block_id, link, is_locked]
"""
links = []
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only)
blocks = []
links_to_validate = []

for vertical in verticals:
blocks.extend(vertical.get_children())

for block in blocks:
usage_key = block.usage_key
block_id = str(block.usage_key)
block_info = get_block_info(block)
block_data = block_info['data']
urls = get_urls(block_data)

for url in urls:
if url == '#': # do not evaluate these 'url'
break

standardized_url = convert_to_standard_url(url, course_key)
status = validate_url_access(standardized_url)
url_list = get_urls(block_data)
links_to_validate += [[block_id, url] for url in url_list]

if status == URL_STATUS['failure']:
links.append([str(usage_key), url, False])
if status == URL_STATUS['forbidden'] and is_studio_url(url):
links.append([str(usage_key), url, True])

return links
return links_to_validate

user = validate_user()

self.status.set_state('Scanning')
courselike_key = CourseKey.from_string(course_key_string)
data = scan_course_for_links(courselike_key)
course_key = CourseKey.from_string(course_key_string)
links_list = scan_course_for_links(course_key)
results = asyncio.run(validate_urls_access(links_list, course_key))

final_results = []
for result in results:
if result['status'] == None: # Request error
print('retry') # TODO retry
if result['status'] == 200: # OK
print('remove from list') # TODO remove
elif result['status'] == 403 and is_studio_url(result['url']):
final_results.append([result['block_id'], result['url'], True])
else:
final_results.append([result['block_id'], result['url'], False])

try:
self.status.increment_completed_steps()

file_name = str(courselike_key)
file_name = str(course_key)
links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug('json file being generated at %s', links_file.name)

with open(links_file.name, 'w') as file:
json.dump(data, file, indent=4)
json.dump(final_results, file, indent=4)

artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as exception: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True)
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
if self.status.state != UserTaskStatus.FAILED:
self.status.fail({'raw_error_msg': str(exception)})
return

0 comments on commit 37854c1

Please sign in to comment.