-
Notifications
You must be signed in to change notification settings - Fork 15
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Source specifying for relative links is allowed #232
base: master
Are you sure you want to change the base?
Changes from all commits
34aed04
c173010
11741eb
2bb8cb4
e7bd6c4
ff473b9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
OLX_STATIC_DIR = "static" | ||
OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}" |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from lxml import html | ||
from cc2olx.iframe_link_parser import KalturaIframeLinkParser | ||
|
||
from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE | ||
from cc2olx.qti import QtiExport | ||
from cc2olx.utils import element_builder, passport_file_parser | ||
|
||
|
@@ -36,11 +37,12 @@ class OlxExport: | |
QTI = "qti" | ||
DISCUSSION = "discussion" | ||
|
||
def __init__(self, cartridge, link_file=None, passport_file=None): | ||
def __init__(self, cartridge, link_file=None, passport_file=None, relative_links_source=None): | ||
self.cartridge = cartridge | ||
self.doc = None | ||
self.link_file = link_file | ||
self.passport_file = passport_file | ||
self.relative_links_source = relative_links_source | ||
self.iframe_link_parser = None | ||
if link_file: | ||
self.iframe_link_parser = KalturaIframeLinkParser(self.link_file) | ||
|
@@ -250,6 +252,27 @@ def process_external_tools_link(item, html): | |
html = html.replace(item, external_tool_url) | ||
return html | ||
|
||
def process_extra_static_files(item, html): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a docstring here explaining what this function is for. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
""" | ||
Turn static file URLs outside OLX_STATIC_DIR into absolute URLs. | ||
|
||
Allow to avoid a situation when the original course page links have | ||
relative URLs, such URLs weren't included into the exported Common | ||
Cartridge course file that causes broken URLs in the imported OeX | ||
course. The function adds the origin source to URLs to make them | ||
absolute ones. | ||
""" | ||
if self.relative_links_source is None: | ||
return html | ||
|
||
for static_file in self.cartridge.extra_static_files: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How large can There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. During the resource content collection, when the external file is found, it's added to this list. So, the list size isn't limited and depends on CC archive content. But we need to process each such link to make it absolute, so we do need to iterate over the list. I don't think it's too expensive because other exporter tasks require much more actions to perform. Do you have any ideas how we can handle all the links without iterating over them? |
||
if item == OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_file): | ||
return html | ||
|
||
url = urllib.parse.urljoin(self.relative_links_source, item) | ||
html = html.replace(item, url) | ||
return html | ||
|
||
for _, item in items: | ||
if "IMS-CC-FILEBASE" in item: | ||
html = process_ims_cc_filebase(item, html) | ||
|
@@ -259,6 +282,8 @@ def process_external_tools_link(item, html): | |
html = process_external_tools_link(item, html) | ||
elif "CANVAS_OBJECT_REFERENCE" in item: | ||
html = process_canvas_reference(item, html) | ||
else: | ||
html = process_extra_static_files(item, html) | ||
|
||
return html | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import argparse | ||
import re | ||
|
||
from cc2olx.utils import is_valid_ipv6_address | ||
|
||
|
||
class LinkSourceValidator: | ||
""" | ||
Validate a link source. | ||
""" | ||
|
||
UL = "\u00a1-\uffff" # Unicode letters range (must not be a raw string). | ||
|
||
# IP patterns | ||
IPV4_REGEX = ( | ||
r"(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)" | ||
r"(?:\.(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)){3}" | ||
) | ||
IPV6_REGEX = r"\[[0-9a-f:.]+\]" # (simple regex, validated later) | ||
|
||
# Host patterns | ||
HOSTNAME_REGEX = rf"[a-z{UL}0-9](?:[a-z{UL}0-9-]{{0,61}}[a-z{UL}0-9])?" | ||
# Max length for domain name labels is 63 characters per RFC 1034 sec. 3.1 | ||
DOMAIN_REGEX = rf"(?:\.(?!-)[a-z{UL}0-9-]{{1,63}}(?<!-))*" | ||
TLD_REGEX = ( | ||
r"\." # dot | ||
r"(?!-)" # can't start with a dash | ||
rf"(?:[a-z{UL}-]{{2,63}}" # domain label | ||
r"|xn--[a-z0-9]{1,59})" # or punycode label | ||
r"(?<!-)" # can't end with a dash | ||
r"\.?" # may have a trailing dot | ||
) | ||
HOST_REGEX = "(" + HOSTNAME_REGEX + DOMAIN_REGEX + TLD_REGEX + "|localhost)" | ||
|
||
LINK_SOURCE_REGEX = ( | ||
r"^https?://" # scheme is validated separately | ||
r"(?:[^\s:@/]+(?::[^\s:@/]*)?@)?" # user:pass authentication | ||
rf"(?P<netloc>{IPV4_REGEX}|{IPV6_REGEX}|{HOST_REGEX})" | ||
r"(?::[0-9]{1,5})?" # port | ||
r"/?" # trailing slash | ||
r"\Z" | ||
) | ||
|
||
message = "Enter a valid URL." | ||
|
||
def __call__(self, value: str) -> str: | ||
if not (link_source_match := re.fullmatch(self.LINK_SOURCE_REGEX, value, re.IGNORECASE)): | ||
raise argparse.ArgumentTypeError(self.message) | ||
|
||
self._validate_ipv6_address(link_source_match.group("netloc")) | ||
|
||
return value | ||
|
||
def _validate_ipv6_address(self, netloc: str) -> None: | ||
""" | ||
Check netloc correctness if it's an IPv6 address. | ||
""" | ||
potential_ipv6_regex = r"^\[(.+)\](?::[0-9]{1,5})?$" | ||
if netloc_match := re.search(potential_ipv6_regex, netloc): | ||
potential_ip = netloc_match[1] | ||
if not is_valid_ipv6_address(potential_ip): | ||
raise argparse.ArgumentTypeError(self.message) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add some validation for the format of
relative_links_source
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Validation is added on the argument parsing step.