diff --git a/README.md b/README.md index 65c33ca..39e9ecb 100644 --- a/README.md +++ b/README.md @@ -36,5 +36,14 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# +### Install poetry virtual environment +``` +poetry install +``` + +### Activate poetry environment +``` +poetry shell +``` -python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname IP --port 8899 --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin --ontoFormat ntriples --ontoVersion originalFailoverLive --ontoPrecedence enforcedPriority \ No newline at end of file +python3 ontologytimemachine/custom_proxy.py --ontoFormat ntriples --ontoVersion originalFailoverLiveLatest --ontoPrecedence enforcedPriority \ No newline at end of file diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index bed6db2..9d07327 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -1,116 +1,132 @@ from proxy.http.proxy import HttpProxyBasePlugin from proxy.http.parser import HttpParser from proxy.common.utils import build_http_response -from ontologytimemachine.utils.utils import parse_arguments from ontologytimemachine.utils.mock_responses import mock_response_403 from ontologytimemachine.proxy_wrapper import HttpRequestWrapper -from ontologytimemachine.utils.proxy_logic import proxy_logic, is_ontology_request_only_ontology -from ontologytimemachine.utils.proxy_logic import is_archivo_ontology_request -from ontologytimemachine.utils.proxy_logic import if_intercept_host +from ontologytimemachine.utils.proxy_logic import ( + get_response_from_request, + if_intercept_host, + is_archivo_ontology_request, +) +from ontologytimemachine.utils.config import Config, parse_arguments from http.client import responses import proxy import sys import logging -IP = '0.0.0.0' -PORT = '8899' +IP = "0.0.0.0" +PORT = "8899" -config = ({'format': 'turtle', 'precedence': 'enforcedPriority', 'patchAcceptUpstream': False}, 'originalFailoverLiveLatest', False, 'all', False, True, None, None) +config = None -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): - logger.info('Init') + logger.info("Init") super().__init__(*args, **kwargs) - (self.ontoFormat, self.ontoVersion, self.restrictedAccess, - self.httpsInterception, self.disableRemovingRedirects, - self.forward_headers, self.timestamp, self.manifest) = config + self.config = config - def before_upstream_connection(self, request: HttpParser): - logger.info('Before upstream connection hook') - logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: + print(config) + logger.info("Before upstream connection hook") + logger.info( + f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" + ) wrapped_request = HttpRequestWrapper(request) if wrapped_request.is_connect_request(): - logger.info(f'HTTPS interception mode: {self.httpsInterception}') + logger.info(f"HTTPS interception mode: {self.config.httpsInterception}") + # Only intercept if interception is enabled - # Move this to the utils - if if_intercept_host(self.httpsInterception): - logger.info('HTTPS interception is on, forwardig the request') + if if_intercept_host(self.config): + logger.info("HTTPS interception is on, forwardig the request") return request else: - logger.info('HTTPS interception is turned off') + logger.info("HTTPS interception is blocked") return None - # If only ontology mode, return None in all other cases - if is_ontology_request_only_ontology(wrapped_request, self.restrictedAccess): - logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') - self.queue_response(mock_response_403) - return None - - if is_archivo_ontology_request(wrapped_request): - logger.debug('The request is for an ontology') - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) + # # If only ontology mode, return None in all other cases + logger.info(f"Config: {self.config}") + response = get_response_from_request(wrapped_request, self.config) + if response: self.queue_response(response) return None - return request - def handle_client_request(self, request: HttpParser): - logger.info('Handle client request hook') - logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + return request - wrapped_request = HttpRequestWrapper(request) - if wrapped_request.is_connect_request(): - return request + def do_intercept(self, _request: HttpParser) -> bool: + wrapped_request = HttpRequestWrapper(_request) + if self.config.httpsInterception in ["all"]: + return True + elif self.config.httpsInterception in ["none"]: + return False + elif self.config.httpsInterception in ["archivo"]: + if is_archivo_ontology_request(wrapped_request): + return True + return False + else: + logger.info( + f"httpsInterception: {self.config.httpsInterception} option is not allowed." + ) + return False - is_ontology_request = is_archivo_ontology_request(wrapped_request) - if not is_ontology_request: - logger.info('The requested IRI is not part of DBpedia Archivo') - return request + def handle_client_request(self, request: HttpParser) -> HttpParser: + logger.info("Handle client request hook") + logger.info( + f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" + ) - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) - self.queue_response(response) + return request - return None - def handle_upstream_chunk(self, chunk: memoryview): return chunk def queue_response(self, response): self.client.queue( build_http_response( - response.status_code, - reason=bytes(responses[response.status_code], 'utf-8'), + response.status_code, + reason=bytes(responses[response.status_code], "utf-8"), headers={ - b'Content-Type': bytes(response.headers.get('Content-Type'), 'utf-8') - }, - body=response.content + b"Content-Type": bytes( + response.headers.get("Content-Type"), "utf-8" + ) + }, + body=response.content, ) ) -if __name__ == '__main__': +if __name__ == "__main__": config = parse_arguments() sys.argv = [sys.argv[0]] # check it https interception is enabled - if config[3] != 'none': + if config.httpsInterception != "none": sys.argv += [ - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', + "--ca-key-file", + "ca-key.pem", + "--ca-cert-file", + "ca-cert.pem", + "--ca-signing-key-file", + "ca-signing-key.pem", ] sys.argv += [ - '--hostname', IP, - '--port', PORT, - '--plugins', __name__ + '.OntologyTimeMachinePlugin' + "--hostname", + IP, + "--port", + PORT, + "--plugins", + __name__ + ".OntologyTimeMachinePlugin", ] logger.info("Starting OntologyTimeMachineProxy server...") - proxy.main() \ No newline at end of file + proxy.main() diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index fe5d3db..6829154 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -1,14 +1,17 @@ from abc import ABC, abstractmethod from proxy.http.parser import HttpParser import logging +from typing import Tuple, Dict, Any - -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +# Configure logger +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) class AbstractRequestWrapper(ABC): - def __init__(self, request): + def __init__(self, request: Any) -> None: self.request = request @abstractmethod @@ -28,74 +31,80 @@ def is_https_request(self) -> bool: pass @abstractmethod - def get_request(self): + def get_request_host(self) -> str: + pass + + @abstractmethod + def get_request_path(self) -> str: pass @abstractmethod - def get_request_headers(self): + def get_request_headers(self) -> Dict[str, str]: pass @abstractmethod - def get_request_accept_header(self): + def get_request_accept_header(self) -> str: pass @abstractmethod - def set_request_accept_header(self, mime_type): + def set_request_accept_header(self, mime_type: str) -> None: pass @abstractmethod - def get_ontology_from_request(self): + def get_request_url_host_path(self) -> Tuple[str, str, str]: pass class HttpRequestWrapper(AbstractRequestWrapper): - def __init__(self, request: HttpParser): + def __init__(self, request: HttpParser) -> None: super().__init__(request) def is_get_request(self) -> bool: - return self.request.method == b'GET' + return self.request.method == b"GET" - def is_connect_request(self): - return self.request.method == b'CONNECT' + def is_connect_request(self) -> bool: + return self.request.method == b"CONNECT" - def is_head_request(self): - return self.request.method == b'HEAD' + def is_head_request(self) -> bool: + return self.request.method == b"HEAD" - def is_https_request(self): - return self.request.method == b'CONNECT' or self.request.headers.get(b'Host', b'').startswith(b'https') + def is_https_request(self) -> bool: + return self.request.method == b"CONNECT" or self.request.headers.get( + b"Host", b"" + ).startswith(b"https") + + def get_request_host(self) -> str: + return self.request.host.decode("utf-8") - def get_request(self): - return self.request + def get_request_path(self) -> str: + return self.request.path.decode("utf-8") - def get_request_headers(self): - headers = {} + def get_request_headers(self) -> Dict[str, str]: + headers: Dict[str, str] = {} for k, v in self.request.headers.items(): - headers[v[0].decode('utf-8')] = v[1].decode('utf-8') + headers[v[0].decode("utf-8")] = v[1].decode("utf-8") return headers - def get_request_accept_header(self): - logger.info('Wrapper - get_request_accept_header') - return self.request.headers[b'accept'][1].decode('utf-8') - - def set_request_accept_header(self, mime_type): - self.request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) + def get_request_accept_header(self) -> str: + logger.info("Wrapper - get_request_accept_header") + return self.request.headers[b"accept"][1].decode("utf-8") + + def set_request_accept_header(self, mime_type: str) -> None: + self.request.headers[b"accept"] = (b"Accept", mime_type.encode("utf-8")) logger.info(f'Accept header set to: {self.request.headers[b"accept"][1]}') - - def get_ontology_from_request(self): - logger.info('Get ontology from request') - print(f'Request protocol: {self.request.protocol}') - print(f'Request host: {self.request.host}') - print(f'Request _url: {self.request._url}') - print(f'Request path: {self.request.path}') - if (self.request.method == b'GET' or self.request.method == b'HEAD') and not self.request.host: + + def get_request_url_host_path(self) -> Tuple[str, str, str]: + logger.info("Get ontology from request") + if (self.request.method in {b"GET", b"HEAD"}) and not self.request.host: for k, v in self.request.headers.items(): - if v[0].decode('utf-8') == 'Host': - host = v[1].decode('utf-8') - path = self.request.path.decode('utf-8') - ontology = 'https://' + host + path + if v[0].decode("utf-8") == "Host": + host = v[1].decode("utf-8") + path = self.request.path.decode("utf-8") + url = f"https://{host}{path}" else: - host = self.request.host.decode('utf-8') - path = self.request.path.decode('utf-8') - ontology = str(self.request._url) - logger.info(f'Ontology: {ontology}') - return ontology, host, path \ No newline at end of file + host = self.request.host.decode("utf-8") + path = self.request.path.decode("utf-8") + url = str(self.request._url) + + logger.info(f"Ontology: {url}") + return url, host, path diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py new file mode 100644 index 0000000..48093d1 --- /dev/null +++ b/ontologytimemachine/utils/config.py @@ -0,0 +1,169 @@ +import argparse +from dataclasses import dataclass +from enum import Enum +from typing import Dict, Any + + +class LogLevel(Enum): + DEBUG = "debug" + INFO = "info" + WARNING = "warning" + ERROR = "error" + + +class OntoFormat(Enum): + TURTLE = "turtle" + NTRIPLES = "ntriples" + RDFXML = "rdfxml" + HTMLDOCU = "htmldocu" + + +class OntoPrecedence(Enum): + DEFAULT = "default" + ENFORCED_PRIORITY = "enforcedPriority" + ALWAYS = "always" + + +class OntoVersion(Enum): + ORIGINAL = "original" + ORIGINAL_FAILOVER_LIVE_LATEST = "originalFailoverLiveLatest" + LATEST_ARCHIVED = "latestArchived" + TIMESTAMP_ARCHIVED = "timestampArchived" + DEPENDENCY_MANIFEST = "dependencyManifest" + + +class HttpsInterception(Enum): + NONE = "none" + ALL = "all" + BLOCK = "block" + ARCHIVO = "archivo" + + +@dataclass +class Config: + logLevel: LogLevel = LogLevel.INFO + ontoFormat: Dict[str, Any] = None + ontoVersion: OntoVersion = (OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST,) + restrictedAccess: bool = False + httpsInterception: HttpsInterception = (HttpsInterception.ALL,) + disableRemovingRedirects: bool = False + timestamp: str = "" + # manifest: Dict[str, Any] = None + + +def enum_parser(enum_class, value): + value_lower = value.lower() + try: + return next(e.value for e in enum_class if e.value.lower() == value_lower) + except StopIteration: + valid_options = ", ".join([e.value for e in enum_class]) + raise ValueError( + f"Invalid value '{value}'. Available options are: {valid_options}" + ) + + +def parse_arguments() -> Config: + parser = argparse.ArgumentParser(description="Process ontology format and version.") + + # Defining ontoFormat argument with nested options + parser.add_argument( + "--ontoFormat", + type=lambda s: enum_parser(OntoFormat, s), + default=OntoFormat.TURTLE.value, + help="Format of the ontology: turtle, ntriples, rdfxml, htmldocu", + ) + + parser.add_argument( + "--ontoPrecedence", + type=lambda s: enum_parser(OntoPrecedence, s), + default=OntoPrecedence.ENFORCED_PRIORITY.value, + help="Precedence of the ontology: default, enforcedPriority, always", + ) + + parser.add_argument( + "--patchAcceptUpstream", + type=bool, + default=False, + help="Defines if the Accept Header is patched upstream in original mode.", + ) + + # Defining ontoVersion argument + parser.add_argument( + "--ontoVersion", + type=lambda s: enum_parser(OntoVersion, s), + default=OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST.value, + help="Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest", + ) + + # Enable/disable mode to only proxy requests to ontologies + parser.add_argument( + "--restrictedAccess", + type=bool, + default=False, + help="Enable/disable mode to only proxy requests to ontologies stored in Archivo.", + ) + + # Enable HTTPS interception for specific domains + parser.add_argument( + "--httpsInterception", + type=lambda s: enum_parser(HttpsInterception, s), + default=HttpsInterception.ALL.value, + help="Enable HTTPS interception for specific domains: none, archivo, all, listfilename.", + ) + + # Enable/disable inspecting or removing redirects + parser.add_argument( + "--disableRemovingRedirects", + type=bool, + default=False, + help="Enable/disable inspecting or removing redirects.", + ) + + # Log level + parser.add_argument( + "--logLevel", + type=lambda s: enum_parser(LogLevel, s), + default=LogLevel.INFO.value, + help="Level of the logging: debug, info, warning, error.", + ) + + args = parser.parse_args() + + # Check the value of --ontoVersion and prompt for additional arguments if needed + if args.ontoVersion == "timestampArchived": + args.timestamp = input("Please provide the timestamp (e.g., YYYY-MM-DD): ") + # Commenting manifest related code as it is not supported in the current version + # elif args.ontoVersion == 'dependencyManifest': + # args.manifest = input('Please provide the manifest file path: ') + + # Accessing the arguments + if hasattr(args, "timestamp"): + timestamp = args.timestamp + else: + timestamp = None + + # if hasattr(args, 'manifest'): + # logger.info(f"Manifest File Path: {args.manifest}") + # manifest = args.manifest + # else: + # manifest = None + + # Create ontoFormat dictionary + ontoFormat = { + "format": args.ontoFormat, + "precedence": args.ontoPrecedence, + "patchAcceptUpstream": args.patchAcceptUpstream, + } + + # Initialize the Config class with parsed arguments + config = Config( + logLevel=args.logLevel, + ontoFormat=ontoFormat, + ontoVersion=args.ontoVersion, + restrictedAccess=args.restrictedAccess, + httpsInterception=args.httpsInterception, + disableRemovingRedirects=args.disableRemovingRedirects, + timestamp=args.timestamp if hasattr(args, "timestamp") else "", + ) + + return config diff --git a/ontologytimemachine/utils/download_archivo_urls.py b/ontologytimemachine/utils/download_archivo_urls.py new file mode 100644 index 0000000..ed3065c --- /dev/null +++ b/ontologytimemachine/utils/download_archivo_urls.py @@ -0,0 +1,139 @@ +import os +import hashlib +import logging +import requests +import schedule +import time +import csv +from datetime import datetime, timedelta +from urllib.parse import urlparse +from typing import Set, Tuple + + +ARCHIVO_PARSED_URLS: Set[Tuple[str, str]] = set() + + +ARCHIVO_FILE_PATH = "ontologytimemachine/utils/archivo_ontologies_download.txt" +ARCHIVO_URL = "https://databus.dbpedia.org/ontologies/archivo-indices/ontologies/2024.07.26-220000/ontologies_type=official.csv" +HASH_FILE_PATH = "ontologytimemachine/utils/archivo_ontologies_hash.txt" + + +LAST_DOWNLOAD_TIMESTAMP = None +DOWNLOAD_INTERVAL = timedelta(days=1) # 1 day interval for checking the download + + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def schedule_daily_download(): + """Schedule the download to run at 3 AM every day.""" + schedule.every().day.at("03:00").do(download_archivo_urls) + + while True: + schedule.run_pending() + time.sleep(60) # Check every minute if there’s a scheduled task + + +# Start the scheduler in the background +def start_scheduler(): + logger.info("Starting the scheduler for daily archivo ontology download.") + schedule_daily_download() + + +# Function to calculate hash of the downloaded file +def calculate_file_hash(file_path): + sha256_hash = hashlib.sha256() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + +# Function to download and update archivo URLs file +def download_archivo_urls(): + """Download the archivo ontologies file, extract the first column, and save to a text file if a new version is available.""" + try: + logger.info("Checking for new version of archivo ontologies") + + # Download the latest archivo ontologies CSV + response = requests.get(ARCHIVO_URL) + response.raise_for_status() # Ensure the request was successful + + # Save the file temporarily to calculate the hash + temp_file_path = "temp_ontology_indices.csv" + with open(temp_file_path, "wb") as temp_file: + temp_file.write(response.content) + + # Calculate the hash of the new file + new_file_hash = calculate_file_hash(temp_file_path) + + # Compare with the existing hash if available + if os.path.exists(HASH_FILE_PATH): + with open(HASH_FILE_PATH, "r") as hash_file: + old_file_hash = hash_file.read().strip() + else: + old_file_hash = None + + if new_file_hash != old_file_hash: + # New version detected, extract the first column and save to the text file + with open(temp_file_path, "r", newline="", encoding="utf-8") as csv_file: + csv_reader = csv.reader(csv_file, delimiter=",") + with open(ARCHIVO_FILE_PATH, "w") as txt_file: + for row in csv_reader: + if row: # Ensure row is not empty + print(row) + txt_file.write( + row[0].strip() + "\n" + ) # Write only the first column (URL) to the text file + + # Save the new hash + with open(HASH_FILE_PATH, "w") as hash_file: + hash_file.write(new_file_hash) + + logger.info("New version of archivo ontologies downloaded and saved.") + else: + # No new version, remove the temporary file + os.remove(temp_file_path) + logger.info("No new version of archivo ontologies detected.") + + # Update the last download timestamp + global LAST_DOWNLOAD_TIMESTAMP + LAST_DOWNLOAD_TIMESTAMP = datetime.now() + + except requests.RequestException as e: + logger.error(f"Failed to download archivo ontologies: {e}") + + +def load_archivo_urls(): + """Load the archivo URLs into the global variable if not already loaded or if a day has passed since the last download.""" + global ARCHIVO_PARSED_URLS + global LAST_DOWNLOAD_TIMESTAMP + + # Check if ARCHIVO_PARSED_URLS is empty or the last download was over a day ago + if not ARCHIVO_PARSED_URLS or ( + LAST_DOWNLOAD_TIMESTAMP is None + or datetime.now() - LAST_DOWNLOAD_TIMESTAMP > DOWNLOAD_INTERVAL + ): + logger.info( + "ARCHIVO_PARSED_URLS is empty or more than a day has passed since the last download." + ) + download_archivo_urls() + + # Load archivo URLs after downloading or if already present + if not ARCHIVO_PARSED_URLS: # Load only if the set is empty + logger.info("Loading archivo ontologies from file") + try: + with open(ARCHIVO_FILE_PATH, "r") as file: + ARCHIVO_PARSED_URLS = { + (urlparse(line.strip()).netloc, urlparse(line.strip()).path) + for line in file + } + logger.info(f"Loaded {len(ARCHIVO_PARSED_URLS)} ontology URLs.") + + except FileNotFoundError: + logger.error("Archivo ontology file not found.") + except Exception as e: + logger.error(f"Error loading archivo ontology URLs: {e}") diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 0837dba..c479547 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -1,131 +1,212 @@ import logging import requests -import rdflib -from urllib.parse import urlparse +from ontologytimemachine.utils.utils import ( + set_onto_format_headers, + get_format_from_accept_header, +) +from ontologytimemachine.utils.download_archivo_urls import load_archivo_urls +from ontologytimemachine.utils.utils import ( + parse_accept_header_with_priority, + archivo_api, + passthrough_status_codes, +) +from ontologytimemachine.utils.mock_responses import ( + mock_response_403, + mock_response_404, + mock_response_500, +) +from typing import Set, Tuple + + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + -from ontologytimemachine.utils.utils import set_onto_format_headers, get_format_from_accept_header -from ontologytimemachine.utils.utils import parse_accept_header_with_priority -from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes -from ontologytimemachine.utils.mock_responses import mock_response_500 -from ontologytimemachine.utils.mock_responses import mock_response_404 +def if_intercept_host(config): + if config.httpsInterception in ["none", "all"]: + return True + elif config.httpsInterception in ["block"]: + return False + return False -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) +def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, only_ontologies): + if only_ontologies: + print(only_ontologies) + is_archivo_ontology = is_archivo_ontology_request(wrapped_request) + if not is_archivo_ontology: + return True + return False + + +def get_response_from_request(wrapped_request, config): + do_deny = do_deny_request_due_non_archivo_ontology_uri( + wrapped_request, config.restrictedAccess + ) + if do_deny: + logger.warning( + "Request denied: not an ontology request and only ontologies mode is enabled" + ) + return mock_response_403 + + response = proxy_logic(wrapped_request, config) + return response + +def is_archivo_ontology_request(wrapped_request): + """Check if the requested ontology is in the archivo.""" + logger.info("Check if the requested ontology is in archivo") + + # Ensure the archivo URLs are loaded + load_archivo_urls() + from ontologytimemachine.utils.download_archivo_urls import ARCHIVO_PARSED_URLS + + # Extract the request's host and path + request_host = wrapped_request.get_request_host() + request_path = wrapped_request.get_request_path() + + print(f"Host: {request_host}") + print(f"Path: {request_path}") + print((request_host, request_path)) + print(list(ARCHIVO_PARSED_URLS)[0]) + if (request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True -def if_intercept_host(https_intercept): - if https_intercept in ['all']: + # Remove last hash and check again + if request_path.endswith("/"): + request_path = request_path.rstrip("/") + if (request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") return True - return False + # Cut the last part of the path -def is_ontology_request_only_ontology(wrapped_request, only_ontologies): - is_archivo_ontology = is_archivo_ontology_request(wrapped_request) - if only_ontologies and not is_archivo_ontology: + path_parts = request_path.split("/") + new_path = "/".join(path_parts[:-1]) + print(f"New path: {new_path}") + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") return True - return False + new_path = "/".join(path_parts[:-2]) + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True -def is_archivo_ontology_request(wrapped_request): - logger.info('Chekc if the requested ontology is in archivo') - with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: - urls = [line.strip() for line in file] - parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] - - _, request_host, request_path = wrapped_request.get_ontology_from_request() - for host, path in parsed_urls: - if request_host == host and request_path.startswith(path): - return True + logger.info(f"Requested URL: {request_host+request_path} is NOT in Archivo") return False def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): allow_redirects = not disableRemovingRedirects try: - response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=5) - logger.info('Successfully fetched original ontology') + response = requests.get( + url=url, headers=headers, allow_redirects=allow_redirects, timeout=5 + ) + logger.info("Successfully fetched original ontology") return response except Exception as e: - logger.error(f'Error fetching original ontology: {e}') + logger.error(f"Error fetching original ontology: {e}") return mock_response_404() -def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp, manifest): - logger.info('Proxy has to intervene') +# change the function definition and pass only the config +def proxy_logic(wrapped_request, config): + logger.info("Proxy has to intervene") - set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) + set_onto_format_headers(wrapped_request, config) headers = wrapped_request.get_request_headers() - ontology, _, _ = wrapped_request.get_ontology_from_request() + ontology, _, _ = wrapped_request.get_request_url_host_path() # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through format = get_format_from_accept_header(headers) - if not format and ontoVersion != 'original': - logger.info(f'No format can be used from Archivo') + if not format and config.ontoVersion != "original": + logger.info(f"No format can be used from Archivo") return mock_response_500 - - if ontoVersion == 'original': - response = fetch_original(ontology, headers, disableRemovingRedirects) - elif ontoVersion == 'originalFailoverLiveLatest': - response = fetch_failover(ontology, headers, disableRemovingRedirects) - elif ontoVersion == 'latestArchived': - response = fetch_latest_archived(ontology, headers) - elif ontoVersion == 'timestampArchived': - response = fetch_timestamp_archived(ontology, headers, timestamp) - elif ontoVersion == 'dependencyManifest': - response = fetch_dependency_manifest(ontology, headers, manifest) + + if config.ontoVersion == "original": + response = fetch_original(ontology, headers, config) + elif config.ontoVersion == "originalFailoverLiveLatest": + response = fetch_failover( + wrapped_request, ontology, headers, config.disableRemovingRedirects + ) + elif config.ontoVersion == "latestArchived": + response = fetch_latest_archived(wrapped_request, ontology, headers) + elif config.ontoVersion == "timestampArchived": + response = fetch_timestamp_archived(wrapped_request, ontology, headers, config) + # Commenting the manifest related part because it is not supported in the current version + # elif ontoVersion == 'dependencyManifest': + # response = fetch_dependency_manifest(ontology, headers, manifest) return response # Fetch from the original source, no matter what def fetch_original(ontology, headers, disableRemovingRedirects): - logger.info(f'Fetching original ontology from URL: {ontology}') + logger.info(f"Fetching original ontology from URL: {ontology}") return request_ontology(ontology, headers, disableRemovingRedirects) # Failover mode -def fetch_failover(ontology, headers, disableRemovingRedirects): - logger.info(f'Fetching original ontology with failover from URL: {ontology}') +def fetch_failover(wrapped_request, ontology, headers, disableRemovingRedirects): + logger.info(f"Fetching original ontology with failover from URL: {ontology}") original_response = request_ontology(ontology, headers, disableRemovingRedirects) if original_response.status_code in passthrough_status_codes: - requested_mimetypes_with_priority = parse_accept_header_with_priority(headers['Accept']) + requested_mimetypes_with_priority = parse_accept_header_with_priority( + headers["Accept"] + ) requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] - response_mime_type = original_response.headers.get('Content-Type', ';').split(';')[0] - logger.info(f'Requested mimetypes: {requested_mimetypes}') - logger.info(f'Response mimetype: {response_mime_type}') + response_mime_type = original_response.headers.get("Content-Type", ";").split( + ";" + )[0] + logger.info(f"Requested mimetypes: {requested_mimetypes}") + logger.info(f"Response mimetype: {response_mime_type}") if response_mime_type in requested_mimetypes: - return original_response + return original_response else: - logging.info(f'The returned type is not the same as the requested one') - return fetch_latest_archived(ontology, headers) + logging.info(f"The returned type is not the same as the requested one") + return fetch_latest_archived(wrapped_request, ontology, headers) else: - logger.info(f'The returend status code is not accepted: {original_response.status_code}') - return fetch_latest_archived(ontology, headers) + logger.info( + f"The returend status code is not accepted: {original_response.status_code}" + ) + return fetch_latest_archived(wrapped_request, ontology, headers) # Fetch the lates version from archivo (no timestamp defined) -def fetch_latest_archived(ontology, headers): - logger.info('Fetch latest archived') +def fetch_latest_archived(wrapped_request, ontology, headers): + if not is_archivo_ontology_request(wrapped_request): + logger.info( + "Data needs to be fetched from Archivo, but ontology is not available on Archivo." + ) + return mock_response_404() + logger.info("Fetch latest archived") format = get_format_from_accept_header(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}" + logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) - -def fetch_timestamp_archived(ontology, headers, timestamp): - logger.info('Fetch archivo timestamp') +def fetch_timestamp_archived(wrapped_request, ontology, headers, config): + if not is_archivo_ontology_request(wrapped_request): + logger.info( + "Data needs to be fetched from Archivo, but ontology is not available on Archivo." + ) + return mock_response_404() + logger.info("Fetch archivo timestamp") format = get_format_from_accept_header(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}&v={timestamp}' - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={config.timestamp}" + logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) def fetch_dependency_manifest(ontology, headers, manifest): - logger.info(f'The dependency manifest is currently not supported') + logger.info(f"The dependency manifest is currently not supported") return mock_response_500 # # Parse RDF data from the dependencies file # manifest_g = rdflib.Graph() @@ -135,24 +216,24 @@ def fetch_dependency_manifest(ontology, headers, manifest): # # Extract dependencies related to the ontology link # ontology = rdflib.URIRef(ontology) - + # dependencies = manifest_g.subjects(predicate=version_namespace.dependency, object=ontology) # for dependency in dependencies: # dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) # dep_file = g.value(subject=dependency, predicate=version_namespace.file) - + # # Make request to DBpedia archive API # if dep_file: # version_param = dep_file.split('v=')[1] - # api_url = f"{dbpedia_api}?o={ontology}&v={version_param}" + # api_url = f"{archivo_api}?o={ontology}&v={version_param}" # else: - # api_url = f"{dbpedia_api}?o={ontology}" - + # api_url = f"{archivo_api}?o={ontology}" + # response = requests.get(api_url) # if response.status_code == 200: # logger.info(f"Successfully fetched {api_url}") # return response # else: # logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") - # return mock_response_404 \ No newline at end of file + # return mock_response_404 diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 8c48945..36075c7 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -3,128 +3,101 @@ from werkzeug.http import parse_accept_header -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) -dbpedia_api = 'https://archivo.dbpedia.org/download' -archivo_mimetypes = ['application/rdf+xml', 'application/owl+xml', 'text/turtle', 'application/n-triples'] +archivo_api = "https://archivo.dbpedia.org/download" +archivo_mimetypes = [ + "application/rdf+xml", + "application/owl+xml", + "text/turtle", + "application/n-triples", +] passthrough_status_codes = [ - 100, 101, 102, 103, + 100, + 101, + 102, + 103, 200, - 300, 301, 302, 303, 304, 307, 308, + 300, + 301, + 302, + 303, + 304, + 307, + 308, 451, ] -def parse_arguments(): - parser = argparse.ArgumentParser(description='Process ontology format and version.') - - # Defining ontoFormat argument with nested options - parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], - default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') - - parser.add_argument('--ontoPrecedence', type=str, choices=['default', 'enforcedPriority', 'always'], - default='enforcedPriority', help='Precedence of the ontology: default, enforcedPriority, always') - - parser.add_argument('--patchAcceptUpstream', type=bool, default=False, - help='Defines if the Accept Header is patched upstream in original mode.') - - # Defining ontoVersion argument - parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLiveLatest', - 'latestArchived', 'timestampArchived', 'dependencyManifest'], - default='originalFailoverLiveLatest', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') - - # Enable/disable mode to only proxy requests to ontologies - parser.add_argument('--restrictedAccess', type=bool, default=False, - help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') - - # Enable HTTPS interception for specific domains - parser.add_argument('--httpsInterception', type=str, choices=['none', 'all'], - default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') - - # Enable/disable inspecting or removing redirects - parser.add_argument('--disableRemovingRedirects', type=bool, default=False, - help='Enable/disable inspecting or removing redirects.') - - # Enable/disable proxy forward headers - parser.add_argument('--forwardHeaders', type=bool, default=True, - help='Enable/disable proxy forward headers.') - - args = parser.parse_args() - - # Check the value of --ontoVersion and prompt for additional arguments if needed - if args.ontoVersion == 'timestampArchived': - args.timestamp = input('Please provide the timestamp (e.g., YYYY-MM-DD): ') - elif args.ontoVersion == 'dependencyManifest': - args.manifest = input('Please provide the manifest file path: ') - - # Accessing the arguments - logger.info(f"Selected Ontology Version: {args.ontoVersion}") - if hasattr(args, 'timestamp'): - logger.info(f"Timestamp: {args.timestamp}") - timestamp = args.timestamp - else: - timestamp = None - - if hasattr(args, 'manifest'): - logger.info(f"Manifest File Path: {args.manifest}") - manifest = args.manifest - else: - manifest = None - - ontoFormat = { - 'format': args.ontoFormat, - 'precedence': args.ontoPrecedence, - 'patchAcceptUpstream': args.patchAcceptUpstream +def get_mime_type(format="turtle"): + # Define a mapping of formats to MIME types + format_to_mime = { + "turtle": "text/turtle", + "ntriples": "application/n-triples", + "rdfxml": "application/rdf+xml", + "htmldocu": "text/html", } - logger.info(f'Ontology Format: {ontoFormat}') - logger.info(f'Ontology Version: {args.ontoVersion}') - logger.info(f'Only Ontologies Mode: {args.restrictedAccess}') - logger.info(f'HTTPS Interception: {args.httpsInterception}') - logger.info(f'Inspect Redirects: {args.disableRemovingRedirects}') - logger.info(f'Forward Headers: {args.forwardHeaders}') - return ontoFormat, args.ontoVersion, args.restrictedAccess, args.httpsInterception, args.disableRemovingRedirects, args.forwardHeaders, timestamp, manifest + # Return the MIME type based on the format or use a generic default + return format_to_mime.get(format, "text/turtle") -def get_mime_type(format='turtle'): - # Define a mapping of formats to MIME types - format_to_mime = { - 'turtle': 'text/turtle', - 'ntriples': 'application/n-triples', - 'rdfxml': 'application/rdf+xml', - 'htmldocu': 'text/html' +def map_mime_to_format(mime_type): + # Map file extensions to formats + mime_to_format = { + "application/rdf+xml": "owl", # Common MIME type for OWL files + "application/owl+xml": "owl", # Specific MIME type for OWL + "text/turtle": "ttl", # MIME type for Turtle format + "application/n-triples": "nt", # MIME type for N-Triples format } - - # Return the MIME type based on the format or use a generic default - return format_to_mime.get(format, 'text/turtle') + + return mime_to_format.get(mime_type, None) -def set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion): - logger.info(f'Setting headers based on ontoFormat: {ontoFormat} and ontoVersion: {ontoVersion}') +def set_onto_format_headers(wrapped_request, config): + logger.info( + f"Setting headers based on ontoFormat: {config.ontoFormat} and ontoVersion: {config.ontoVersion}" + ) # if ontoVersion is original and patchAcceptUpstream is False nothing to do here - if ontoVersion == 'original' and not ontoFormat['patchAcceptUpstream']: + if ( + config.ontoVersion == "original" + and not config.ontoFormat["patchAcceptUpstream"] + ): return - + # Determine the correct MIME type for the format - mime_type = get_mime_type(ontoFormat['format']) - logger.info(f'Requested mimetype by proxy: {mime_type}') + mime_type = get_mime_type(config.ontoFormat["format"]) + logger.info(f"Requested mimetype by proxy: {mime_type}") # Define conditions for modifying the accept header request_accept_header = wrapped_request.get_request_accept_header() - logger.info(f'Accept header by request: {request_accept_header}') + logger.info(f"Accept header by request: {request_accept_header}") req_headers_with_priority = parse_accept_header_with_priority(request_accept_header) req_headers = [x[0] for x in req_headers_with_priority] - if not req_headers and ontoFormat['precedence'] in ['default', ['enforcedPriority']]: + if not req_headers and config.ontoFormat["precedence"] in [ + "default", + ["enforcedPriority"], + ]: wrapped_request.set_request_accept_header(mime_type) - elif len(req_headers) == 1 and req_headers[0] == '*/*' and ontoFormat['precedence'] in ['default', 'enforcedPriority']: + elif ( + len(req_headers) == 1 + and req_headers[0] == "*/*" + and config.ontoFormat["precedence"] in ["default", "enforcedPriority"] + ): wrapped_request.set_request_accept_header(mime_type) - elif len(req_headers) > 1 and mime_type in req_headers and ontoFormat['precedence'] == 'enforcedPriority': + elif ( + len(req_headers) > 1 + and mime_type in req_headers + and config.ontoFormat["precedence"] == "enforcedPriority" + ): wrapped_request.set_request_accept_header(mime_type) - elif ontoFormat['precedence'] == 'always': + elif config.ontoFormat["precedence"] == "always": wrapped_request.set_request_accept_header(mime_type) @@ -136,7 +109,9 @@ def select_highest_priority_mime_from_archivo(mime_list): highest_priority = sorted_mime_list[0][1] # Filter MIME types that match the highest priority - highest_priority_mimes = [mime for mime, priority in sorted_mime_list if priority == highest_priority] + highest_priority_mimes = [ + mime for mime, priority in sorted_mime_list if priority == highest_priority + ] # Check if any of the highest priority MIME types are in the archivo list for mime in highest_priority_mimes: @@ -147,26 +122,14 @@ def select_highest_priority_mime_from_archivo(mime_list): return None -def map_mime_to_format(mime_type): - # Map file extensions to formats - mime_to_format = { - 'application/rdf+xml': 'owl', # Common MIME type for OWL files - 'application/owl+xml': 'owl', # Specific MIME type for OWL - 'text/turtle': 'ttl', # MIME type for Turtle format - 'application/n-triples': 'nt', # MIME type for N-Triples format - } - - return mime_to_format.get(mime_type, None) - - def parse_accept_header_with_priority(accept_header): - logger.info('Parse accept header') + logger.info("Parse accept header") # Parse the Accept header to extract MIME types and their priority (q values) parsed = parse_accept_header(accept_header) - + # Create a list of tuples with MIME types and their corresponding q values mime_types_with_priority = [(item[0], item[1]) for item in parsed] - logger.info(f'Accept headers with priority: {mime_types_with_priority}') + logger.info(f"Accept headers with priority: {mime_types_with_priority}") return mime_types_with_priority @@ -176,18 +139,20 @@ def get_format_from_accept_header(headers): return None # Map MIME types to formats - accept_header = headers.get('Accept', None) - logger.info(f'Accept header: {accept_header}') + accept_header = headers.get("Accept", None) + logger.info(f"Accept header: {accept_header}") if not accept_header: return None - + accept_header_with_priority = parse_accept_header_with_priority(accept_header) - - selected_mimetype = select_highest_priority_mime_from_archivo(accept_header_with_priority) + + selected_mimetype = select_highest_priority_mime_from_archivo( + accept_header_with_priority + ) if not selected_mimetype: - logger.info(f'The requested mimetype is not supported by DBpedia Archivo') + logger.info(f"The requested mimetype is not supported by DBpedia Archivo") return None - + format = map_mime_to_format(selected_mimetype) - return format \ No newline at end of file + return format diff --git a/poetry.lock b/poetry.lock index 6b8cef0..1e34442 100644 --- a/poetry.lock +++ b/poetry.lock @@ -137,15 +137,18 @@ test = ["pytest (>=6)"] [[package]] name = "idna" -version = "3.8" +version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" files = [ - {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, - {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -268,13 +271,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "proxy-py" -version = "2.4.7" +version = "2.4.8" description = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework." optional = false python-versions = ">=3.6" files = [ - {file = "proxy.py-2.4.7-py3-none-any.whl", hash = "sha256:83ddfda5479403434eace531c2bdef41fd9091df473a4051cd9df1564de056a9"}, - {file = "proxy_py-2.4.7.tar.gz", hash = "sha256:2e20ad717025cdee92d528be1321b7af8743d941e56de2ae6f390c6dc67aaad1"}, + {file = "proxy.py-2.4.8-py3-none-any.whl", hash = "sha256:316cbed3184c8ddf4f9b3143f7dc449ef1d44a7c5ca1988276a01444f6426e51"}, + {file = "proxy_py-2.4.8.tar.gz", hash = "sha256:77088312aa558c9402af2b88d135a1e261af51f5e38242f1d37867559a0a65cb"}, ] [package.extras] @@ -299,13 +302,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "8.3.2" +version = "8.3.3" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, - {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, ] [package.dependencies] @@ -361,6 +364,20 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "schedule" +version = "1.2.2" +description = "Job scheduling for humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "schedule-1.2.2-py3-none-any.whl", hash = "sha256:5bef4a2a0183abf44046ae0d164cadcac21b1db011bdd8102e4a0c1e91e06a7d"}, + {file = "schedule-1.2.2.tar.gz", hash = "sha256:15fe9c75fe5fd9b9627f3f19cc0ef1420508f9f9a46f45cd0769ef75ede5f0b7"}, +] + +[package.extras] +timezone = ["pytz"] + [[package]] name = "six" version = "1.16.0" @@ -385,13 +402,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.2" +version = "2.2.3" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, - {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, ] [package.extras] @@ -420,4 +437,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "aa072190e1a5c335c379c9f3ab09b14dfcf718050b38b08441ba2a91ffefd935" +content-hash = "9efdbca22e8f7d122208d160253c194f4f3d177e77a011491bbaac34fac5c237" diff --git a/pyproject.toml b/pyproject.toml index 0232beb..ebce3c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ requests = "^2.32.3" proxy-py = "^2.4.4" rdflib = "^7.0.0" werkzeug = "^3.0.4" +schedule = "^1.2.2" [build-system] diff --git a/tests/oldtest_integration.py b/tests/oldtest_integration.py deleted file mode 100644 index d5d8b8e..0000000 --- a/tests/oldtest_integration.py +++ /dev/null @@ -1,193 +0,0 @@ -import pytest -import requests -import time -import subprocess -import itertools -from ontologytimemachine.custom_proxy import IP, PORT - - -PROXY = f'{IP}:{PORT}' -HTTP_PROXY = f'http://{PROXY}' -HTTPS_PROXY = f'http://{PROXY}' -PROXIES = { - "http": HTTP_PROXY, - "https": HTTPS_PROXY -} -CA_CERT_PATH = "ca-cert.pem" - - -@pytest.fixture(scope="module", autouse=True) -def start_proxy_server(): - # Start the proxy server in a subprocess - process = subprocess.Popen( - [ - 'python3', '-m', 'proxy', - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', - '--hostname', IP, - '--port', PORT, - '--plugins', 'ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin' - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - - # Wait a bit to ensure the server starts - time.sleep(5) - - yield - "http://0.0.0.0:8899" - # Terminate the proxy server after tests - process.terminate() - process.wait() - - -def test_babelnet(): - iri = 'http://babelnet.org/rdf/' - generic_test(iri, 'text/turtle') - - -def test_bag_basisregistraties(): - iri = 'http://bag.basisregistraties.overheid.nl/def/bag' - generic_test(iri, 'text/turtle') - - -def test_bblfish(): - iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' - generic_test(iri, 'text/turtle') - - -def test_brk_basisregistraties(): - iri = 'http://brk.basisregistraties.overheid.nl/def/brk' - generic_test(iri, 'text/turtle') - - -def test_brt_basisregistraties(): - iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' - generic_test(iri, 'text/turtle') - - -def test_brt_basisregistraties_begrippenkader(): - iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' - generic_test(iri, 'text/turtle') - - -def test_buzzword(): - iri = 'http://buzzword.org.uk/rdf/personal-link-types#' - generic_test(iri, 'text/turtle') - - -def test_catalogus_professorum(): - iri = 'http://catalogus-professorum.org/cpm/2/' - generic_test(iri, 'text/turtle') - - -def test_data_gov(): - iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' - generic_test(iri, 'text/turtle') - - -def test_data_bigdatagrapes(): - iri = 'http://data.bigdatagrapes.eu/resource/ontology/' - generic_test(iri, 'text/turtle') - - -def test_data_europa_esco(): - iri = 'http://data.europa.eu/esco/flow' - generic_test(iri, 'text/turtle') - - -def test_data_globalchange(): - iri = 'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') - - -def test_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') - - -def test_data_opendiscoveryspace(): - iri = 'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' - generic_test(iri, 'text/turtle') - - -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' - generic_test(iri, 'text/turtle') - - -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://dbpedia.org/ontology/Person' - generic_test(iri, 'text/turtle') - - -def test_linked_web_apis(): - iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' - generic_test(iri, 'text/turtle') - - -#def test_ontologi_es(): -# iri = 'http://ontologi.es/days#' -# generic_test(iri, 'text/turtle') - - -def test_https(): - iri = "https://www.w3id.org/simulation/ontology/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def test_https(): - iri = "https://vocab.eccenca.com/auth/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def not_test_all_iris(): - with open('tests/archivo_ontologies_test.txt', 'r') as file: - for line in file: - iri = line.strip() - if iri: # Ensure it's not an empty line - iri_generic_test(iri) - - -def generic_test(iri, content_type): - response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) - assert response.status_code == 200 - assert iri in response.content.decode('utf-8') - - -def iri_generic_test(iri): - try: - response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) - assert response.status_code == 200 - assert iri in response.content.decode('utf-8') - print(f"Test passed for IRI: {iri}") - except AssertionError: - print(f"Test failed for IRI: {iri}") - except requests.exceptions.RequestException as e: - print(f"Request failed for IRI: {iri}, Error: {e}") - - -def get_parameter_combinations(): -# Define the possible values for each parameter - ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] - ontoPrecedence = ['default', 'enforcedPriority', 'always'] - patchAcceptUpstream = [True, False] - ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', - 'latestArchive', 'timestampArchive', 'dependencyManifest'] - onlyOntologies = [True, False] - httpsIntercept = [True, False] - inspectRedirects = [True, False] - forwardHeaders = [True, False] - subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] - - combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, - onlyOntologies, httpsIntercept, inspectRedirects, - forwardHeaders, subjectBinarySearchThreshold)) - return combinations - - -if __name__ == '__main__': - - pytest.main() diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..8d1db7e --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,36 @@ +import unittest +from ontologytimemachine.utils.config import parse_arguments, Config +import sys + + +class TestConfig(unittest.TestCase): + + def test_parse_arguments(self): + test_args = [ + "test", + "--ontoFormat", + "turtle", + "--ontoPrecedence", + "enforcedPriority", + "--patchAcceptUpstream", + "False", + "--ontoVersion", + "original", + "--httpsInterception", + "none", + "--disableRemovingRedirects", + "False", + "--logLevel", + "info", + ] + sys.argv = test_args + config = parse_arguments() + self.assertIsInstance(config, Config) + self.assertEqual(config.ontoFormat["format"], "turtle") + self.assertEqual(config.ontoVersion, "original") + self.assertEqual(config.restrictedAccess, False) + self.assertEqual(config.httpsInterception, "none") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_integration.py b/tests/test_integration.py index e7a2ef6..472fa3d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,13 +6,10 @@ from ontologytimemachine.custom_proxy import IP, PORT -PROXY = f'{IP}:{PORT}' -HTTP_PROXY = f'http://{PROXY}' -HTTPS_PROXY = f'http://{PROXY}' -PROXIES = { - "http": HTTP_PROXY, - "https": HTTPS_PROXY -} +PROXY = f"{IP}:{PORT}" +HTTP_PROXY = f"http://{PROXY}" +HTTPS_PROXY = f"http://{PROXY}" +PROXIES = {"http": HTTP_PROXY, "https": HTTPS_PROXY} CA_CERT_PATH = "ca-cert.pem" @@ -21,15 +18,15 @@ # # Start the proxy server in a subprocess # process = subprocess.Popen( # [ -# 'python3', 'ontologytimemachine/custom_proxy.py', +# 'python3', 'ontologytimemachine/custom_proxy.py', # ], # stdout=subprocess.PIPE, # stderr=subprocess.PIPE # ) - + # # Wait a bit to ensure the server starts # time.sleep(5) - + # yield # "http://0.0.0.0:8899" # # Terminate the proxy server after tests @@ -38,90 +35,91 @@ def test_12_data_globalchange(): - iri = 'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') + iri = "http://data.globalchange.gov/gcis.owl" + generic_test(iri, "text/turtle") def test_13_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') + iri = "http://data.ontotext.com/resource/leak/" + generic_test(iri, "text/turtle") def test_1_babelnet(): - iri = 'http://babelnet.org/rdf/' - generic_test(iri, 'text/turtle') + iri = "http://babelnet.org/rdf/" + generic_test(iri, "text/turtle") + def test_2_bag_basisregistraties(): - iri = 'http://bag.basisregistraties.overheid.nl/def/bag' - generic_test(iri, 'text/turtle') + iri = "http://bag.basisregistraties.overheid.nl/def/bag" + generic_test(iri, "text/turtle") def test_3_bblfish(): - iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' - generic_test(iri, 'text/turtle') + iri = "http://bblfish.net/work/atom-owl/2006-06-06/" + generic_test(iri, "text/turtle") def test_4_brk_basisregistraties(): - iri = 'http://brk.basisregistraties.overheid.nl/def/brk' - generic_test(iri, 'text/turtle') + iri = "http://brk.basisregistraties.overheid.nl/def/brk" + generic_test(iri, "text/turtle") def test_5_brt_basisregistraties(): - iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' - generic_test(iri, 'text/turtle') + iri = "http://brt.basisregistraties.overheid.nl/def/top10nl" + generic_test(iri, "text/turtle") def test_6_brt_basisregistraties_begrippenkader(): - iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' - generic_test(iri, 'text/turtle') + iri = "http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl" + generic_test(iri, "text/turtle") def test_7_buzzword(): - iri = 'http://buzzword.org.uk/rdf/personal-link-types#' - generic_test(iri, 'text/turtle') + iri = "http://buzzword.org.uk/rdf/personal-link-types#" + generic_test(iri, "text/turtle") def test_8_catalogus_professorum(): - iri = 'http://catalogus-professorum.org/cpm/2/' - generic_test(iri, 'text/turtle') + iri = "http://catalogus-professorum.org/cpm/2/" + generic_test(iri, "text/turtle") def test_9_data_gov(): - iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' - generic_test(iri, 'text/turtle') + iri = "http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf" + generic_test(iri, "text/turtle") def test_10_data_bigdatagrapes(): - iri = 'http://data.bigdatagrapes.eu/resource/ontology/' - generic_test(iri, 'text/turtle') + iri = "http://data.bigdatagrapes.eu/resource/ontology/" + generic_test(iri, "text/turtle") def test_11_data_europa_esco(): - iri = 'http://data.europa.eu/esco/flow' - generic_test(iri, 'text/turtle') + iri = "http://data.europa.eu/esco/flow" + generic_test(iri, "text/turtle") def test_14_data_ordnancesurvey_50kGazetteer(): - iri = 'http://dbpedia.org/ontology/Person' - generic_test(iri, 'text/turtle') + iri = "http://dbpedia.org/ontology/Person" + generic_test(iri, "text/turtle") def test_15_linked_web_apis(): - iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' - generic_test(iri, 'text/turtle') + iri = "http://linked-web-apis.fit.cvut.cz/ns/core" + generic_test(iri, "text/turtle") def generic_test(iri, content_type): response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 - assert iri in response.content.decode('utf-8') + assert iri in response.content.decode("utf-8") def iri_generic_test(iri): try: response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 - assert iri in response.content.decode('utf-8') + assert iri in response.content.decode("utf-8") print(f"Test passed for IRI: {iri}") except AssertionError: print(f"Test failed for IRI: {iri}") @@ -130,24 +128,40 @@ def iri_generic_test(iri): def get_parameter_combinations(): -# Define the possible values for each parameter - ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] - ontoPrecedence = ['default', 'enforcedPriority', 'always'] - patchAcceptUpstream = [True, False] - ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', - 'latestArchive', 'timestampArchive', 'dependencyManifest'] - onlyOntologies = [True, False] - httpsIntercept = [True, False] - inspectRedirects = [True, False] - forwardHeaders = [True, False] - subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] - - combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, - onlyOntologies, httpsIntercept, inspectRedirects, - forwardHeaders, subjectBinarySearchThreshold)) - return combinations - - -if __name__ == '__main__': - + # Define the possible values for each parameter + ontoFormat = ["turtle", "ntriples", "rdfxml", "htmldocu"] + ontoPrecedence = ["default", "enforcedPriority", "always"] + patchAcceptUpstream = [True, False] + ontoVersion = [ + "original", + "originalFailoverLive", + "originalFailoverArchivoMonitor", + "latestArchive", + "timestampArchive", + "dependencyManifest", + ] + onlyOntologies = [True, False] + httpsIntercept = [True, False] + inspectRedirects = [True, False] + forwardHeaders = [True, False] + subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] + + combinations = list( + itertools.product( + ontoFormat, + ontoPrecedence, + patchAcceptUpstream, + ontoVersion, + onlyOntologies, + httpsIntercept, + inspectRedirects, + forwardHeaders, + subjectBinarySearchThreshold, + ) + ) + return combinations + + +if __name__ == "__main__": + pytest.main() diff --git a/tests/test_mock_responses.py b/tests/test_mock_responses.py new file mode 100644 index 0000000..a145d60 --- /dev/null +++ b/tests/test_mock_responses.py @@ -0,0 +1,34 @@ +import unittest +from ontologytimemachine.utils.mock_responses import ( + mock_response_200, + mock_response_403, + mock_response_404, + mock_response_500, +) + + +class TestMockResponses(unittest.TestCase): + + def test_mock_response_200(self): + response = mock_response_200() + self.assertEqual(response.status_code, 200) + self.assertIn("