Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: locked link #35976

Merged
merged 5 commits into from
Dec 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cms/djangoapps/contentstore/core/course_optimizer_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ def generate_broken_links_descriptor(json_content, request_user):
xblock_dictionary = {} # dictionary of xblock attributes

for item in json_content:
block_id, link, *is_locked = item
is_locked_flag = bool(is_locked)
block_id, link, *rest = item
is_locked_flag = bool(rest[0])

usage_key = usage_key_with_run(block_id)
block = get_xblock(usage_key, request_user)
Expand Down
171 changes: 118 additions & 53 deletions cms/djangoapps/contentstore/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
import tarfile
import re
import requests
import aiohttp
import asyncio
import time
from datetime import datetime
from tempfile import NamedTemporaryFile, mkdtemp

Expand Down Expand Up @@ -1117,101 +1120,163 @@ def validate_user():
return

def get_urls(content):
    """
    Return all urls found after href or src attributes in content.

    Urls beginning with '#' (in-page fragment anchors) are excluded via the
    negative lookahead, since they never reference an external resource.
    (The previous docstring claimed only the literal '#' was skipped; the
    regex actually skips every fragment-only url.)
    """
    regex = r'\s+(?:href|src)=["\'](?!#)([^"\']*)["\']'
    url_list = re.findall(regex, content)
    return url_list

def is_studio_url(url):
    """Return True when url is a studio-relative url (no http/https scheme)."""
    return not url.startswith(('http://', 'https://'))

def convert_to_standard_url(url, course_key):
    """
    Return a standard, requestable url for a studio url; other urls pass through.

    Example urls:
        /assets/courseware/v1/506da5d6f866e8f0be44c5df8b6e6b2a/asset-v1:edX+DemoX+Demo_Course+type@asset+block/getting-started_x250.png
        /static/getting-started_x250.png
        /container/block-v1:edX+DemoX+Demo_Course+type@vertical+block@2152d4a4aadc4cb0af5256394a3d1fc7
    """
    # Absolute http(s) urls need no rewriting.
    if not is_studio_url(url):
        return url
    # /static/ asset links are resolved through the asset pipeline; the
    # surrounding quotes are required by replace_static_urls and stripped after.
    if url.startswith('/static/'):
        processed_url = replace_static_urls(f'\"{url}\"', course_id=course_key)[1:-1]
        return 'http://' + settings.CMS_BASE + processed_url
    # Other absolute paths are served directly from the CMS host.
    if url.startswith('/'):
        return 'http://' + settings.CMS_BASE + url
    # Bare block ids resolve under the container view.
    return 'http://' + settings.CMS_BASE + '/container/' + url

def check_url(url):
    """Return SUCCESS, ACCESS_DENIED, or FAILURE after requesting url."""
    try:
        status = requests.get(url, timeout=5).status_code
    except requests.exceptions.RequestException:
        # Connection-level problems (DNS, timeout, refused) count as failures.
        return "FAILURE"
    if status == 200:
        return "SUCCESS"
    if status == 403:
        return "ACCESS_DENIED"
    return "FAILURE"

def verify_url(url):
    """Return True when a GET of url answers with HTTP 200."""
    try:
        return requests.get(url, timeout=5).status_code == 200
    except requests.exceptions.RequestException:
        # Any request-level error means the url could not be verified.
        return False

def scan_course_for_links(course_key):
    """
    Return every href/src url found in the published verticals of a course.

    Returns: [[block_id1, url1], [block_id2, url2], ...]
    """
    # Only published content is scanned; drafts are not user-visible.
    verticals = modulestore().get_items(
        course_key,
        qualifiers={'category': 'vertical'},
        revision=ModuleStoreEnum.RevisionOption.published_only,
    )
    blocks = []
    urls_to_validate = []

    for vertical in verticals:
        blocks.extend(vertical.get_children())

    for block in blocks:
        block_id = str(block.usage_key)
        block_data = get_block_info(block)['data']
        urls_to_validate += [[block_id, url] for url in get_urls(block_data)]

    return urls_to_validate

async def validate_url_access(session, url_data, course_key):
    """
    Request a single url and record its HTTP status.

    Returns: {'block_id': ..., 'url': ..., 'status': int or None}
    A status of None marks a request that errored out (retryable).
    """
    block_id, url = url_data
    result = {'block_id': block_id, 'url': url}
    target = convert_to_standard_url(url, course_key)
    try:
        async with session.get(target, timeout=5) as response:
            result['status'] = response.status
    except Exception as e:  # pylint: disable=broad-except
        # Broad catch is deliberate: any client/timeout error is retryable.
        result['status'] = None
        LOGGER.debug(f'[Link Check] Request error when validating {url}: {str(e)}')
    return result

async def validate_urls_access_in_batches(url_list, course_key, batch_size=100):
    """
    Request the statuses of a list of urls, batch_size requests at a time.

    Returns: [{'block_id': ..., 'url': ..., 'status': ...}, ...]
    """
    responses = []
    url_count = len(url_list)
    # Ceiling division: the old `url_count // batch_size + 1` over-counted
    # whenever url_count was an exact multiple of batch_size.
    total_batches = (url_count + batch_size - 1) // batch_size

    for i in range(0, url_count, batch_size):
        batch = url_list[i:i + batch_size]
        # A fresh session per batch keeps the connection pool bounded.
        async with aiohttp.ClientSession() as session:
            tasks = [validate_url_access(session, url_data, course_key) for url_data in batch]
            batch_results = await asyncio.gather(*tasks)
            responses.extend(batch_results)
            LOGGER.debug(f'[Link Check] request batch {i // batch_size + 1} of {total_batches}')

    return responses

def retry_validation(url_list, course_key, retry_count=3):
    """
    Retry urls whose requests failed with a connection error (status None).

    Returns: [[block_id, url, is_locked], ...] for urls that resolved to a
    definite status on retry. Urls still failing after retry_count attempts
    are logged and dropped.
    """
    results = []
    retry_list = url_list
    for attempt in range(retry_count):
        # Stop as soon as everything has resolved instead of looping idle.
        if not retry_list:
            break
        LOGGER.debug(f'[Link Check] retry attempt #{attempt + 1}')
        validated_url_list = asyncio.run(
            validate_urls_access_in_batches(retry_list, course_key, batch_size=100)
        )
        # Fixed typo: was `filetered_url_list`.
        filtered_url_list, retry_list = filter_by_status(validated_url_list)
        results.extend(filtered_url_list)

    if retry_list:
        LOGGER.debug(f'[Link Check] {len(retry_list)} requests failed due to connection error')

    return results
def filter_by_status(results):
    """
    Partition url request results by HTTP status.

    200: OK — url is fine, drop it.
    403 on a studio url: asset is locked — record with is_locked=True.
    None: connection error — queue for retry.
    Anything else: broken link — record with is_locked=False.

    Returns:
        filtered_results: [[block_id1, url1, is_locked], ...]
        retry_list: [[block_id1, url1], ...]
    """
    filtered_results = []
    retry_list = []
    for result in results:
        status = result['status']
        # `is None` rather than `== None` (PEP 8; avoids __eq__ surprises).
        if status is None:
            retry_list.append([result['block_id'], result['url']])
        elif status == 200:
            continue
        elif status == 403 and is_studio_url(result['url']):
            filtered_results.append([result['block_id'], result['url'], True])
        else:
            filtered_results.append([result['block_id'], result['url'], False])

    return filtered_results, retry_list

user = validate_user()

self.status.set_state('Scanning')
courselike_key = CourseKey.from_string(course_key_string)
data = scan_course(courselike_key)
course_key = CourseKey.from_string(course_key_string)
url_list = scan_course_for_links(course_key)
validated_url_list = asyncio.run(validate_urls_access_in_batches(url_list, course_key, batch_size=100))
broken_or_locked_urls, retry_list = filter_by_status(validated_url_list)

if retry_list:
retry_results = retry_validation(retry_list, course_key, retry_count=3)
broken_or_locked_urls.extend(retry_results)

try:
self.status.increment_completed_steps()

file_name = str(courselike_key)
links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug('json file being generated at %s', links_file.name)
file_name = str(course_key)
broken_links_file = NamedTemporaryFile(prefix=file_name + '.', suffix='.json')
LOGGER.debug(f'[Link Check] json file being generated at {broken_links_file.name}')

with open(links_file.name, 'w') as file:
json.dump(data, file, indent=4)
with open(broken_links_file.name, 'w') as file:
json.dump(broken_or_locked_urls, file, indent=4)

artifact = UserTaskArtifact(status=self.status, name='BrokenLinks')
artifact.file.save(name=os.path.basename(links_file.name), content=File(links_file))
artifact.file.save(name=os.path.basename(broken_links_file.name), content=File(broken_links_file))
artifact.save()

# catch all exceptions so we can record useful error messages
except Exception as exception: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', courselike_key, exc_info=True)
except Exception as e: # pylint: disable=broad-except
LOGGER.exception('Error checking links for course %s', course_key, exc_info=True)
if self.status.state != UserTaskStatus.FAILED:
self.status.fail({'raw_error_msg': str(exception)})
self.status.fail({'raw_error_msg': str(e)})
return
Loading