Skip to content

Commit

Permalink
feat: locked link (#35976)
Browse files Browse the repository at this point in the history
  • Loading branch information
rayzhou-bit authored Dec 15, 2024
1 parent 6f98200 commit 3f82c62
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 55 deletions.
4 changes: 2 additions & 2 deletions cms/djangoapps/contentstore/core/course_optimizer_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def generate_broken_links_descriptor(json_content, request_user):
xblock_dictionary = {} # dictionary of xblock attributes

for item in json_content:
block_id, link, *is_locked = item
is_locked_flag = bool(is_locked)
block_id, link, *rest = item
is_locked_flag = bool(rest[0])

usage_key = usage_key_with_run(block_id)
block = get_xblock(usage_key, request_user)
Expand Down
171 changes: 118 additions & 53 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import tarfile
import re
import requests
import aiohttp
import asyncio
import time
from datetime import datetime
from tempfile import NamedTemporaryFile, mkdtemp

Expand Down Expand Up @@ -1117,101 +1120,163 @@ def validate_user():
return

def get_urls(content):
"""Returns all urls after href and src in content."""
regex = r'\s+(?:href|src)=["\']([^"\']*)["\']'
urls = re.findall(regex, content)
return urls
"""
Returns all urls found after href and src in content.
Excludes urls that are only '#'.
"""
regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
url_list = re.findall(regex, content)
return url_list

def is_studio_url(url):
"""Returns True if url is a studio url."""
return not url.startswith('http://') and not url.startswith('https://')

def convert_to_standard_url(url, course_key):
"""
Returns standard urls when given studio urls.
Returns standard urls when given studio urls. Otherwise return url as is.
Example urls:
/assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
/static/getting-started_x250.png
/container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
"""
if not url.startswith('http://') and not url.startswith('https://'):
if is_studio_url(url):
if url.startswith('/static/'):
processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
return 'http://' + settings.CMS_BASE + processed_url
elif url.startswith('/'):
return 'http://' + settings.CMS_BASE + url
else:
return 'http://' + settings.CMS_BASE + '/container/' + url
else:
return url

def check_url(url):
"""Returns SUCCESS, ACCESS_DENIED, or FAILURE after checking url request"""
try:
response = requests.get(url, timeout=5)
if response.status_code == 200:
return "SUCCESS"
elif response.status_code == 403:
return "ACCESS_DENIED"
else:
return "FAILURE"
except requests.exceptions.RequestException as e:
return "FAILURE"

def verify_url(url):
"""Returns true if url request returns 200"""
try:
response = requests.get(url, timeout=5)
return response.status_code == 200
except requests.exceptions.RequestException as e:
return False

def scan_course(course_key):
def scan_course_for_links(course_key):
"""
Scans the course and returns broken link tuples.
[<block_id>, <broken_link>]
Returns a list of all urls in a course.
Returns: [ [block_id1, url1], [block_id2, url2], ... ]
"""
broken_links = []
verticals = modulestore().get_items(course_key, qualifiers={'category': 'vertical'}, revision=ModuleStoreEnum.RevisionOption.published_only)
print('verticals: ', verticals)
blocks = []
urls_to_validate = []

for vertical in verticals:
blocks.extend(vertical.get_children())

for block in blocks:
usage_key = block.usage_key
block_id = str(block.usage_key)
block_info = get_block_info(block)
block_data = block_info['data']
urls = get_urls(block_data)

for url in urls:
if url == '#':
break
standardized_url = convert_to_standard_url(url, course_key)
# TODO if check_url ACCESS_DENIED and is studio link, it's locked
if not verify_url(standardized_url):
broken_links.append([str(usage_key), url])
url_list = get_urls(block_data)
urls_to_validate += [[block_id, url] for url in url_list]

return urls_to_validate

async def validate_url_access(session, url_data, course_key):
"""
Returns the status of a url request
Returns: {block_id1, url1, status}
"""
block_id, url = url_data
result = {'block_id': block_id, 'url': url}
standardized_url = convert_to_standard_url(url, course_key)
try:
async with session.get(standardized_url, timeout=5) as response:
result.update({'status': response.status})
except Exception as e:
result.update({'status': None})
LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
return result

async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
"""
Returns the statuses of a list of url requests.
Returns: [ {block_id1, url1, status}, {block_id2, url2, status}, ... ]
"""
responses = []
url_count = len(url_list)

for i in range(0, url_count, batch_size):
batch = url_list[i:i + batch_size]
async with aiohttp.ClientSession() as session:
tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
batch_results = await asyncio.gather(*tasks)
responses.extend(batch_results)
LOGGER.debug(f'[Link Check] request batch {i // batch_size+1} of {url_count // batch_size + 1}')

return responses

def retry_validation(url_list, course_key, retry_count=3):
"""Retry urls that failed due to connection error."""
results = []
retry_list = url_list
for i in range(0, retry_count):
if retry_list:
LOGGER.debug(f'[Link Check] retry attempt #{i+1}')
validated_url_list = asyncio.run(validate_urls_access_in_batches(retry_list, course_key, batch_size=100))
filetered_url_list, retry_list = filter_by_status(validated_url_list)
results.extend(filetered_url_list)

if retry_list:
LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error')

return results

return broken_links
def filter_by_status(results):
"""
Filter results by status.
200: OK. No need to do more
403: Forbidden. Record as locked link.
None: Error. Retry up to 3 times.
Other: Failure. Record as broken link.
Returns:
filtered_results: [ [block_id1, url1, is_locked], ... ]
retry_list: [ [block_id1, url1], ... ]
"""
filtered_results = []
retry_list = []
for result in results:
if result['status'] == None:
retry_list.append([result['block_id'], result['url']])
elif result['status'] == 200:
continue
elif result['status'] == 403 and is_studio_url(result['url']):
filtered_results.append([result['block_id'], result['url'], True])
else:
filtered_results.append([result['block_id'], result['url'], False])

return filtered_results, retry_list

user = validate_user()

self.status.set_state('Scanning')
courselike_key = CourseKey.from_string(course_key_string)
data = scan_course(courselike_key)
course_key = CourseKey.from_string(course_key_string)
url_list = scan_course_for_links(course_key)
validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)

if retry_list:
retry_results = retry_validation(retry_list, course_key, retry_count=3)
broken_or_locked_urls.extend(retry_results)

try:
self.status.increment_completed_steps()

file_name = str(courselike_key)
links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug('json file being generated at %s', links_file.name)
file_name = str(course_key)
broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}')

with open(links_file.name, 'w') as file:
json.dump(data, file, indent=4)
with open(broken_links_file.name, 'w') as file:
json.dump(broken_or_locked_urls, file, indent=4)

artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as exception: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True)
except Exception as e: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
if self.status.state != UserTaskStatus.FAILED:
self.status.fail({'raw_error_msg': str(exception)})
self.status.fail({'raw_error_msg': str(e)})
return

0 comments on commit 3f82c62

Please sign in to comment.