Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Source specifying for relative links is allowed #232

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ The link map file can be supplied using `-f` or `--link_file`::

cc2olx -r zip -i <IMSCC_FILE> -f <CSV_FILE>

If the original course content contains relative links and the resources
(images, documents etc) the links point to are not included into the exported
course dump, you can specify their source using `-s` flag:

cc2olx -i <IMSCC_FILE> -s <RELATIVE_LINKS_SOURCE>

Test Data
---------
Expand Down
9 changes: 9 additions & 0 deletions src/cc2olx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from pathlib import Path

from cc2olx.validators.cli import LinkSourceValidator

RESULT_TYPE_FOLDER = "folder"
RESULT_TYPE_ZIP = "zip"

Expand Down Expand Up @@ -69,4 +71,11 @@ def parse_args(args=None):
"should contain consumer_id, consumer_key and consumer_secret."
),
)
parser.add_argument(
"-s",
"--relative_links_source",
nargs="?",
type=LinkSourceValidator(),
help="The relative links source in the format '<scheme>://<netloc>', e.g. 'https://example.com'",
)
return parser.parse_args(args)
2 changes: 2 additions & 0 deletions src/cc2olx/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OLX_STATIC_DIR = "static"
OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
23 changes: 19 additions & 4 deletions src/cc2olx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,25 @@
from cc2olx import filesystem
from cc2olx import olx
from cc2olx.cli import parse_args, RESULT_TYPE_FOLDER, RESULT_TYPE_ZIP
from cc2olx.models import Cartridge, OLX_STATIC_DIR
from cc2olx.constants import OLX_STATIC_DIR
from cc2olx.models import Cartridge
from cc2olx.settings import collect_settings


def convert_one_file(input_file, workspace, link_file=None, passport_file=None):
def convert_one_file(
input_file,
workspace,
link_file=None,
passport_file=None,
relative_links_source=None,
):
filesystem.create_directory(workspace)

cartridge = Cartridge(input_file, workspace)
cartridge.load_manifest_extracted()
cartridge.normalize()

olx_export = olx.OlxExport(cartridge, link_file, passport_file)
olx_export = olx.OlxExport(cartridge, link_file, passport_file, relative_links_source)
olx_filename = cartridge.directory.parent / (cartridge.directory.name + "-course.xml")
policy_filename = cartridge.directory.parent / "policy.json"

Expand Down Expand Up @@ -53,6 +60,7 @@ def main():
workspace = settings["workspace"]
link_file = settings["link_file"]
passport_file = settings["passport_file"]
relative_links_source = settings["relative_links_source"]

# setup logger
logging_config = settings["logging_config"]
Expand All @@ -64,7 +72,14 @@ def main():

for input_file in settings["input_files"]:
try:
convert_one_file(input_file, temp_workspace, link_file, passport_file)
convert_one_file(
input_file,
temp_workspace,
link_file,
passport_file,
relative_links_source,
)

except Exception:
logger.exception("Error while converting %s file", input_file)

Expand Down
21 changes: 3 additions & 18 deletions src/cc2olx/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import zipfile

from cc2olx import filesystem
from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
from cc2olx.external.canvas import ModuleMeta
from cc2olx.qti import QtiParser
from cc2olx.utils import clean_file_name
Expand All @@ -24,22 +25,6 @@
DIFFUSE_SHALLOW_SECTIONS = False
DIFFUSE_SHALLOW_SUBSECTIONS = True

OLX_STATIC_DIR = "static"

OLX_DIRECTORIES = [
"about",
"assets",
"chapter",
"course",
"html",
"info",
"policies",
"problem",
"sequential",
OLX_STATIC_DIR,
"vertical",
]


def is_leaf(container):
return "identifierref" in container
Expand Down Expand Up @@ -343,7 +328,7 @@ def get_resource_content(self, identifier):
return "html", {"html": html}
elif "web_resources" in str(res_filename) and imghdr.what(str(res_filename)):
static_filename = str(res_filename).split("web_resources/")[1]
olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, static_filename)
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
html = (
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
'</head><body><p><img src="{}" alt="{}"></p></body></html>'.format(olx_static_path, static_filename)
Expand All @@ -353,7 +338,7 @@ def get_resource_content(self, identifier):
# This webcontent is outside of ``web_resources`` directory
# So we need to manually copy it to OLX_STATIC_DIR
self.extra_static_files.append(res_relative_path)
olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, res_relative_path)
olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
html = (
'<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
'</head><body><p><a href="{}" alt="{}">{}<a></p></body></html>'.format(
Expand Down
27 changes: 26 additions & 1 deletion src/cc2olx/olx.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from lxml import html
from cc2olx.iframe_link_parser import KalturaIframeLinkParser

from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
from cc2olx.qti import QtiExport
from cc2olx.utils import element_builder, passport_file_parser

Expand Down Expand Up @@ -36,11 +37,12 @@ class OlxExport:
QTI = "qti"
DISCUSSION = "discussion"

def __init__(self, cartridge, link_file=None, passport_file=None):
def __init__(self, cartridge, link_file=None, passport_file=None, relative_links_source=None):
self.cartridge = cartridge
self.doc = None
self.link_file = link_file
self.passport_file = passport_file
self.relative_links_source = relative_links_source
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add some validation for the format of relative_links_source.

Copy link
Contributor Author

@myhailo-chernyshov-rg myhailo-chernyshov-rg Dec 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validation is added on the argument parsing step.

self.iframe_link_parser = None
if link_file:
self.iframe_link_parser = KalturaIframeLinkParser(self.link_file)
Expand Down Expand Up @@ -250,6 +252,27 @@ def process_external_tools_link(item, html):
html = html.replace(item, external_tool_url)
return html

def process_extra_static_files(item, html):
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a docstring here explaining what this function is for.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

"""
Turn static file URLs outside OLX_STATIC_DIR into absolute URLs.

Allow to avoid a situation when the original course page links have
relative URLs, such URLs weren't included into the exported Common
Cartridge course file that causes broken URLs in the imported OeX
course. The function adds the origin source to URLs to make them
absolute ones.
"""
if self.relative_links_source is None:
return html

for static_file in self.cartridge.extra_static_files:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How large can extra_static_files get? This looks like it could get pretty expensive to loop through every entity in this list for each item.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

During the resource content collection, when the external file is found, it's added to this list. So, the list size isn't limited and depends on CC archive content. But we need to process each such link to make it absolute, so we do need to iterate over the list. I don't think it's too expensive because other exporter tasks require much more actions to perform. Do you have any ideas how we can handle all the links without iterating over them?

if item == OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_file):
return html

url = urllib.parse.urljoin(self.relative_links_source, item)
html = html.replace(item, url)
return html

for _, item in items:
if "IMS-CC-FILEBASE" in item:
html = process_ims_cc_filebase(item, html)
Expand All @@ -259,6 +282,8 @@ def process_external_tools_link(item, html):
html = process_external_tools_link(item, html)
elif "CANVAS_OBJECT_REFERENCE" in item:
html = process_canvas_reference(item, html)
else:
html = process_extra_static_files(item, html)

return html

Expand Down
1 change: 1 addition & 0 deletions src/cc2olx/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ def collect_settings(parsed_args):
"workspace": Path.cwd() / parsed_args.output,
"link_file": parsed_args.link_file,
"passport_file": parsed_args.passport_file,
"relative_links_source": parsed_args.relative_links_source,
}
return settings
16 changes: 14 additions & 2 deletions src/cc2olx/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
""" Utility functions for cc2olx"""

import logging
import string
import csv
import ipaddress
import logging
import re
import string

logger = logging.getLogger()

Expand Down Expand Up @@ -108,3 +109,14 @@ def clean_file_name(filename: str):

cleaned_name = re.sub(special_characters, "_", filename)
return cleaned_name


def is_valid_ipv6_address(value: str) -> bool:
"""
Return whether the value is a valid IPv6 address.
"""
try:
ipaddress.IPv6Address(value)
except ValueError:
return False
return True
Empty file.
62 changes: 62 additions & 0 deletions src/cc2olx/validators/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import argparse
import re

from cc2olx.utils import is_valid_ipv6_address


class LinkSourceValidator:
"""
Validate a link source.
"""

UL = "\u00a1-\uffff" # Unicode letters range (must not be a raw string).

# IP patterns
IPV4_REGEX = (
r"(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)"
r"(?:\.(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)){3}"
)
IPV6_REGEX = r"\[[0-9a-f:.]+\]" # (simple regex, validated later)

# Host patterns
HOSTNAME_REGEX = rf"[a-z{UL}0-9](?:[a-z{UL}0-9-]{{0,61}}[a-z{UL}0-9])?"
# Max length for domain name labels is 63 characters per RFC 1034 sec. 3.1
DOMAIN_REGEX = rf"(?:\.(?!-)[a-z{UL}0-9-]{{1,63}}(?<!-))*"
TLD_REGEX = (
r"\." # dot
r"(?!-)" # can't start with a dash
rf"(?:[a-z{UL}-]{{2,63}}" # domain label
r"|xn--[a-z0-9]{1,59})" # or punycode label
r"(?<!-)" # can't end with a dash
r"\.?" # may have a trailing dot
)
HOST_REGEX = "(" + HOSTNAME_REGEX + DOMAIN_REGEX + TLD_REGEX + "|localhost)"

LINK_SOURCE_REGEX = (
r"^https?://" # scheme is validated separately
r"(?:[^\s:@/]+(?::[^\s:@/]*)?@)?" # user:pass authentication
rf"(?P<netloc>{IPV4_REGEX}|{IPV6_REGEX}|{HOST_REGEX})"
r"(?::[0-9]{1,5})?" # port
r"/?" # trailing slash
r"\Z"
)

message = "Enter a valid URL."

def __call__(self, value: str) -> str:
if not (link_source_match := re.fullmatch(self.LINK_SOURCE_REGEX, value, re.IGNORECASE)):
raise argparse.ArgumentTypeError(self.message)

self._validate_ipv6_address(link_source_match.group("netloc"))

return value

def _validate_ipv6_address(self, netloc: str) -> None:
"""
Check netloc correctness if it's an IPv6 address.
"""
potential_ipv6_regex = r"^\[(.+)\](?::[0-9]{1,5})?$"
if netloc_match := re.search(potential_ipv6_regex, netloc):
potential_ip = netloc_match[1]
if not is_valid_ipv6_address(potential_ip):
raise argparse.ArgumentTypeError(self.message)
42 changes: 41 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from argparse import Namespace
from pathlib import Path

import pytest

from cc2olx.cli import parse_args

Expand All @@ -16,7 +19,13 @@ def test_parse_args(imscc_file):
)

assert parsed_args == Namespace(
inputs=[imscc_file], loglevel="INFO", result="folder", link_file=None, passport_file=None, output="output"
inputs=[imscc_file],
loglevel="INFO",
result="folder",
link_file=None,
passport_file=None,
output="output",
relative_links_source=None,
)


Expand All @@ -34,6 +43,7 @@ def test_parse_args_csv_file(imscc_file, link_map_csv):
link_file=link_map_csv,
passport_file=None,
output="output",
relative_links_source=None,
)


Expand All @@ -49,4 +59,34 @@ def test_parse_args_passport_file(imscc_file, passports_csv):
link_file=None,
passport_file=passports_csv,
output="output",
relative_links_source=None,
)


def test_parse_args_with_correct_relative_links_source(imscc_file: Path) -> None:
"""
Positive input test for relative links source argument.
"""
relative_links_source = "https://example.com"

parsed_args = parse_args(["-i", str(imscc_file), "-s", relative_links_source])

assert parsed_args == Namespace(
inputs=[imscc_file],
loglevel="INFO",
result="folder",
link_file=None,
passport_file=None,
output="output",
relative_links_source=relative_links_source,
)


def test_parse_args_with_incorrect_relative_links_source(imscc_file: Path) -> None:
"""
Test arguments parser detects incorrect relative links sources.
"""
relative_links_source = "ws://example.com"

with pytest.raises(SystemExit):
parse_args(["-i", str(imscc_file), "-s", relative_links_source])
1 change: 1 addition & 0 deletions tests/test_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ def test_collect_settings(imscc_file):
"level": parsed_args.loglevel,
"format": "{%(filename)s:%(lineno)d} - %(message)s",
},
"relative_links_source": None,
}
Empty file.
Loading
Loading