openedx · myhailo-chernyshov-rg · Sep 13, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/README.rst b/README.rst
@@ -58,6 +58,11 @@ The link map file can be supplied using `-f` or `--link_file`::
 
     cc2olx -r zip -i <IMSCC_FILE> -f <CSV_FILE>
 
+If the original course content contains relative links and the resources
+(images, documents etc) the links point to are not included into the exported
+course dump, you can specify their source using `-s` flag:
+
+    cc2olx -i <IMSCC_FILE> -s <RELATIVE_LINKS_SOURCE>
 
 Test Data
 ---------

diff --git a/src/cc2olx/cli.py b/src/cc2olx/cli.py
@@ -2,6 +2,8 @@
 
 from pathlib import Path
 
+from cc2olx.validators.cli import LinkSourceValidator
+
 RESULT_TYPE_FOLDER = "folder"
 RESULT_TYPE_ZIP = "zip"
 
@@ -69,4 +71,11 @@ def parse_args(args=None):
             "should contain consumer_id, consumer_key and consumer_secret."
         ),
     )
+    parser.add_argument(
+        "-s",
+        "--relative_links_source",
+        nargs="?",
+        type=LinkSourceValidator(),
+        help="The relative links source in the format '<scheme>://<netloc>', e.g. 'https://example.com'",
+    )
     return parser.parse_args(args)
diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -0,0 +1,2 @@
+OLX_STATIC_DIR = "static"
+OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"
diff --git a/src/cc2olx/main.py b/src/cc2olx/main.py
@@ -8,18 +8,25 @@
 from cc2olx import filesystem
 from cc2olx import olx
 from cc2olx.cli import parse_args, RESULT_TYPE_FOLDER, RESULT_TYPE_ZIP
-from cc2olx.models import Cartridge, OLX_STATIC_DIR
+from cc2olx.constants import OLX_STATIC_DIR
+from cc2olx.models import Cartridge
 from cc2olx.settings import collect_settings
 
 
-def convert_one_file(input_file, workspace, link_file=None, passport_file=None):
+def convert_one_file(
+    input_file,
+    workspace,
+    link_file=None,
+    passport_file=None,
+    relative_links_source=None,
+):
     filesystem.create_directory(workspace)
 
     cartridge = Cartridge(input_file, workspace)
     cartridge.load_manifest_extracted()
     cartridge.normalize()
 
-    olx_export = olx.OlxExport(cartridge, link_file, passport_file)
+    olx_export = olx.OlxExport(cartridge, link_file, passport_file, relative_links_source)
     olx_filename = cartridge.directory.parent / (cartridge.directory.name + "-course.xml")
     policy_filename = cartridge.directory.parent / "policy.json"
 
@@ -53,6 +60,7 @@ def main():
     workspace = settings["workspace"]
     link_file = settings["link_file"]
     passport_file = settings["passport_file"]
+    relative_links_source = settings["relative_links_source"]
 
     # setup logger
     logging_config = settings["logging_config"]
@@ -64,7 +72,14 @@ def main():
 
         for input_file in settings["input_files"]:
             try:
-                convert_one_file(input_file, temp_workspace, link_file, passport_file)
+                convert_one_file(
+                    input_file,
+                    temp_workspace,
+                    link_file,
+                    passport_file,
+                    relative_links_source,
+                )
+
             except Exception:
                 logger.exception("Error while converting %s file", input_file)
 

diff --git a/src/cc2olx/models.py b/src/cc2olx/models.py
@@ -6,6 +6,7 @@
 import zipfile
 
 from cc2olx import filesystem
+from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
 from cc2olx.external.canvas import ModuleMeta
 from cc2olx.qti import QtiParser
 from cc2olx.utils import clean_file_name
@@ -24,22 +25,6 @@
 DIFFUSE_SHALLOW_SECTIONS = False
 DIFFUSE_SHALLOW_SUBSECTIONS = True
 
-OLX_STATIC_DIR = "static"
-
-OLX_DIRECTORIES = [
-    "about",
-    "assets",
-    "chapter",
-    "course",
-    "html",
-    "info",
-    "policies",
-    "problem",
-    "sequential",
-    OLX_STATIC_DIR,
-    "vertical",
-]
-
 
 def is_leaf(container):
     return "identifierref" in container
@@ -343,7 +328,7 @@ def get_resource_content(self, identifier):
                 return "html", {"html": html}
             elif "web_resources" in str(res_filename) and imghdr.what(str(res_filename)):
                 static_filename = str(res_filename).split("web_resources/")[1]
-                olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, static_filename)
+                olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_filename)
                 html = (
                     '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
                     '</head><body><p><img src="{}" alt="{}"></p></body></html>'.format(olx_static_path, static_filename)
@@ -353,7 +338,7 @@ def get_resource_content(self, identifier):
                 # This webcontent is outside of ``web_resources`` directory
                 # So we need to manually copy it to OLX_STATIC_DIR
                 self.extra_static_files.append(res_relative_path)
-                olx_static_path = "/{}/{}".format(OLX_STATIC_DIR, res_relative_path)
+                olx_static_path = OLX_STATIC_PATH_TEMPLATE.format(static_filename=res_relative_path)
                 html = (
                     '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>'
                     '</head><body><p><a href="{}" alt="{}">{}<a></p></body></html>'.format(

diff --git a/src/cc2olx/olx.py b/src/cc2olx/olx.py
@@ -7,6 +7,7 @@
 from lxml import html
 from cc2olx.iframe_link_parser import KalturaIframeLinkParser
 
+from cc2olx.constants import OLX_STATIC_PATH_TEMPLATE
 from cc2olx.qti import QtiExport
 from cc2olx.utils import element_builder, passport_file_parser
 
@@ -36,11 +37,12 @@ class OlxExport:
     QTI = "qti"
     DISCUSSION = "discussion"
 
-    def __init__(self, cartridge, link_file=None, passport_file=None):
+    def __init__(self, cartridge, link_file=None, passport_file=None, relative_links_source=None):
         self.cartridge = cartridge
         self.doc = None
         self.link_file = link_file
         self.passport_file = passport_file
+        self.relative_links_source = relative_links_source
         self.iframe_link_parser = None
         if link_file:
             self.iframe_link_parser = KalturaIframeLinkParser(self.link_file)
@@ -250,6 +252,27 @@ def process_external_tools_link(item, html):
             html = html.replace(item, external_tool_url)
             return html
 
+        def process_extra_static_files(item, html):
+            """
+            Turn static file URLs outside OLX_STATIC_DIR into absolute URLs.
+
+            Allow to avoid a situation when the original course page links have
+            relative URLs, such URLs weren't included into the exported Common
+            Cartridge course file that causes broken URLs in the imported OeX
+            course. The function adds the origin source to URLs to make them
+            absolute ones.
+            """
+            if self.relative_links_source is None:
+                return html
+
+            for static_file in self.cartridge.extra_static_files:
+                if item == OLX_STATIC_PATH_TEMPLATE.format(static_filename=static_file):
+                    return html
+
+            url = urllib.parse.urljoin(self.relative_links_source, item)
+            html = html.replace(item, url)
+            return html
+
         for _, item in items:
             if "IMS-CC-FILEBASE" in item:
                 html = process_ims_cc_filebase(item, html)
@@ -259,6 +282,8 @@ def process_external_tools_link(item, html):
                 html = process_external_tools_link(item, html)
             elif "CANVAS_OBJECT_REFERENCE" in item:
                 html = process_canvas_reference(item, html)
+            else:
+                html = process_extra_static_files(item, html)
 
         return html
 

diff --git a/src/cc2olx/settings.py b/src/cc2olx/settings.py
@@ -47,5 +47,6 @@ def collect_settings(parsed_args):
         "workspace": Path.cwd() / parsed_args.output,
         "link_file": parsed_args.link_file,
         "passport_file": parsed_args.passport_file,
+        "relative_links_source": parsed_args.relative_links_source,
     }
     return settings
diff --git a/src/cc2olx/utils.py b/src/cc2olx/utils.py
@@ -1,9 +1,10 @@
 """ Utility functions for cc2olx"""
 
-import logging
-import string
 import csv
+import ipaddress
+import logging
 import re
+import string
 
 logger = logging.getLogger()
 
@@ -108,3 +109,14 @@ def clean_file_name(filename: str):
 
     cleaned_name = re.sub(special_characters, "_", filename)
     return cleaned_name
+
+
+def is_valid_ipv6_address(value: str) -> bool:
+    """
+    Return whether the value is a valid IPv6 address.
+    """
+    try:
+        ipaddress.IPv6Address(value)
+    except ValueError:
+        return False
+    return True
diff --git a/src/cc2olx/validators/__init__.py b/src/cc2olx/validators/__init__.py
diff --git a/src/cc2olx/validators/cli.py b/src/cc2olx/validators/cli.py
@@ -0,0 +1,62 @@
+import argparse
+import re
+
+from cc2olx.utils import is_valid_ipv6_address
+
+
+class LinkSourceValidator:
+    """
+    Validate a link source.
+    """
+
+    UL = "\u00a1-\uffff"  # Unicode letters range (must not be a raw string).
+
+    # IP patterns
+    IPV4_REGEX = (
+        r"(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)"
+        r"(?:\.(?:0|25[0-5]|2[0-4][0-9]|1[0-9]?[0-9]?|[1-9][0-9]?)){3}"
+    )
+    IPV6_REGEX = r"\[[0-9a-f:.]+\]"  # (simple regex, validated later)
+
+    # Host patterns
+    HOSTNAME_REGEX = rf"[a-z{UL}0-9](?:[a-z{UL}0-9-]{{0,61}}[a-z{UL}0-9])?"
+    # Max length for domain name labels is 63 characters per RFC 1034 sec. 3.1
+    DOMAIN_REGEX = rf"(?:\.(?!-)[a-z{UL}0-9-]{{1,63}}(?<!-))*"
+    TLD_REGEX = (
+        r"\."  # dot
+        r"(?!-)"  # can't start with a dash
+        rf"(?:[a-z{UL}-]{{2,63}}"  # domain label
+        r"|xn--[a-z0-9]{1,59})"  # or punycode label
+        r"(?<!-)"  # can't end with a dash
+        r"\.?"  # may have a trailing dot
+    )
+    HOST_REGEX = "(" + HOSTNAME_REGEX + DOMAIN_REGEX + TLD_REGEX + "|localhost)"
+
+    LINK_SOURCE_REGEX = (
+        r"^https?://"  # scheme is validated separately
+        r"(?:[^\s:@/]+(?::[^\s:@/]*)?@)?"  # user:pass authentication
+        rf"(?P<netloc>{IPV4_REGEX}|{IPV6_REGEX}|{HOST_REGEX})"
+        r"(?::[0-9]{1,5})?"  # port
+        r"/?"  # trailing slash
+        r"\Z"
+    )
+
+    message = "Enter a valid URL."
+
+    def __call__(self, value: str) -> str:
+        if not (link_source_match := re.fullmatch(self.LINK_SOURCE_REGEX, value, re.IGNORECASE)):
+            raise argparse.ArgumentTypeError(self.message)
+
+        self._validate_ipv6_address(link_source_match.group("netloc"))
+
+        return value
+
+    def _validate_ipv6_address(self, netloc: str) -> None:
+        """
+        Check netloc correctness if it's an IPv6 address.
+        """
+        potential_ipv6_regex = r"^\[(.+)\](?::[0-9]{1,5})?$"
+        if netloc_match := re.search(potential_ipv6_regex, netloc):
+            potential_ip = netloc_match[1]
+            if not is_valid_ipv6_address(potential_ip):
+                raise argparse.ArgumentTypeError(self.message)
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,4 +1,7 @@
 from argparse import Namespace
+from pathlib import Path
+
+import pytest
 
 from cc2olx.cli import parse_args
 
@@ -16,7 +19,13 @@ def test_parse_args(imscc_file):
     )
 
     assert parsed_args == Namespace(
-        inputs=[imscc_file], loglevel="INFO", result="folder", link_file=None, passport_file=None, output="output"
+        inputs=[imscc_file],
+        loglevel="INFO",
+        result="folder",
+        link_file=None,
+        passport_file=None,
+        output="output",
+        relative_links_source=None,
     )
 
 
@@ -34,6 +43,7 @@ def test_parse_args_csv_file(imscc_file, link_map_csv):
         link_file=link_map_csv,
         passport_file=None,
         output="output",
+        relative_links_source=None,
     )
 
 
@@ -49,4 +59,34 @@ def test_parse_args_passport_file(imscc_file, passports_csv):
         link_file=None,
         passport_file=passports_csv,
         output="output",
+        relative_links_source=None,
+    )
+
+
+def test_parse_args_with_correct_relative_links_source(imscc_file: Path) -> None:
+    """
+    Positive input test for relative links source argument.
+    """
+    relative_links_source = "https://example.com"
+
+    parsed_args = parse_args(["-i", str(imscc_file), "-s", relative_links_source])
+
+    assert parsed_args == Namespace(
+        inputs=[imscc_file],
+        loglevel="INFO",
+        result="folder",
+        link_file=None,
+        passport_file=None,
+        output="output",
+        relative_links_source=relative_links_source,
     )
+
+
+def test_parse_args_with_incorrect_relative_links_source(imscc_file: Path) -> None:
+    """
+    Test arguments parser detects incorrect relative links sources.
+    """
+    relative_links_source = "ws://example.com"
+
+    with pytest.raises(SystemExit):
+        parse_args(["-i", str(imscc_file), "-s", relative_links_source])
diff --git a/tests/test_settings.py b/tests/test_settings.py
@@ -19,4 +19,5 @@ def test_collect_settings(imscc_file):
             "level": parsed_args.loglevel,
             "format": "{%(filename)s:%(lineno)d} - %(message)s",
         },
+        "relative_links_source": None,
     }
diff --git a/tests/test_validators/__init__.py b/tests/test_validators/__init__.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		OLX_STATIC_DIR = "static"
		OLX_STATIC_PATH_TEMPLATE = f"/{OLX_STATIC_DIR}/{{static_filename}}"