From dd000b489169ed9dac0b0330692914ec7ff6bb5c Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 26 Jul 2024 12:48:33 +0200 Subject: [PATCH 01/35] first version of the dependency lock based mode --- ontologytimemachine/custom_proxy.py | 1 + ontologytimemachine/utils/dependency.ttl | 25 ++++++++ ontologytimemachine/utils/utils.py | 38 +++++++++++- poetry.lock | 62 ++++++++++++++++++- pyproject.toml | 1 + .../{test_proxy.py => oldtest_integration.py} | 21 +++++++ 6 files changed, 144 insertions(+), 4 deletions(-) create mode 100644 ontologytimemachine/utils/dependency.ttl rename tests/{test_proxy.py => oldtest_integration.py} (81%) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 95cf79f..a8c40e8 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -103,5 +103,6 @@ def queue_response(self, response): '--port', PORT, '--plugins', __name__ + '.OntologyTimeMachinePlugin' ] + logger.info("Starting OntologyTimeMachineProxy server...") proxy.main() \ No newline at end of file diff --git a/ontologytimemachine/utils/dependency.ttl b/ontologytimemachine/utils/dependency.ttl new file mode 100644 index 0000000..d28bac7 --- /dev/null +++ b/ontologytimemachine/utils/dependency.ttl @@ -0,0 +1,25 @@ +@prefix ex-version: + owl:imports , ; + ex-version:current ; + ex-version:version + , + [ + ex-version:snapshot ; + ex-version:file ; + ex-version:dependency , ; + ] . + + + ex-version:snapshot ; + ex-version:file ; + ex-version:dependency , ; +] + + ex-version:snapshot ; + ex-version:file . + + ex-version:snapshot ; + ex-version:file . + + ex-version:snapshot ; + ex-version:file . diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 227b8eb..2435393 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -6,6 +6,7 @@ import logging import requests import argparse +import rdflib import mimetypes @@ -124,7 +125,7 @@ def get_ontology_from_request(request): if v[0].decode('utf-8') == 'Host': host = v[1].decode('utf-8') path = request.path.decode('utf-8') - ontology = 'https://' + host + request.path.decode('utf-8') + ontology = 'https://' + host + path else: host = request.host.decode('utf-8') path = request.path.decode('utf-8') @@ -168,7 +169,7 @@ def proxy_logic(request: HttpParser, ontoFormat, ontoVersion): response = fetch_original(ontology, headers) elif ontoVersion == 'originalFailoverLive': response = fetch_failover(ontology, headers, live=True) - elif ontoVersion == 'originalFailoverMonitor': + elif ontoVersion == 'originalFailoverArchivoontoVersionMonitor': response = fetch_failover(ontology, headers, monitor=True) elif ontoVersion == 'latestArchive': response = fetch_latest_archive(ontology, headers) @@ -233,7 +234,38 @@ def fetch_timestamp_archive(ontology, headers): def fetch_dependency_manifest(ontology, headers): - return mock_response_404 + dependencies_file = "ontologytimemachine/utils/dependency.ttl" + # Parse RDF data from the dependencies file + g = rdflib.Graph() + g.parse(dependencies_file, format="turtle") + + version_namespace = rdflib.Namespace("https://example.org/versioning/") + + # Extract dependencies related to the ontology link + ontology = rdflib.URIRef(ontology) + + dependencies = g.subjects(predicate=version_namespace.dependency, object=ontology) + + for dependency in dependencies: + dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) + dep_file = g.value(subject=dependency, 
predicate=version_namespace.file) + + # Make request to DBpedia archive API + base_api_url = "https://archivo.dbpedia.org/download" + + if dep_file: + version_param = dep_file.split('v=')[1] + api_url = f"{base_api_url}?o={ontology}&v={version_param}" + else: + api_url = f"{base_api_url}?o={ontology}" + + response = requests.get(api_url) + if response.status_code == 200: + logger.info(f"Successfully fetched {api_url}") + return response + else: + logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") + return mock_response_404 def failover_mode(request): diff --git a/poetry.lock b/poetry.lock index ce05dc1..5eb6572 100644 --- a/poetry.lock +++ b/poetry.lock @@ -157,6 +157,20 @@ files = [ {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = false +python-versions = "*" +files = [ + {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, + {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, +] + +[package.dependencies] +six = "*" + [[package]] name = "packaging" version = "24.0" @@ -194,6 +208,20 @@ files = [ {file = "proxy_py-2.4.4.tar.gz", hash = "sha256:216581f70ad673f4ecb5f6b27f52491aaf1c056829f4a670f5ea3b5a340f4272"}, ] +[[package]] +name = "pyparsing" +version = "3.1.2" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ + {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, + {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pytest" version = "8.2.2" @@ -216,6 +244,27 @@ tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "rdflib" +version = "7.0.0" +description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." 
+optional = false +python-versions = ">=3.8.1,<4.0.0" +files = [ + {file = "rdflib-7.0.0-py3-none-any.whl", hash = "sha256:0438920912a642c866a513de6fe8a0001bd86ef975057d6962c79ce4771687cd"}, + {file = "rdflib-7.0.0.tar.gz", hash = "sha256:9995eb8569428059b8c1affd26b25eac510d64f5043d9ce8c84e0d0036e995ae"}, +] + +[package.dependencies] +isodate = ">=0.6.0,<0.7.0" +pyparsing = ">=2.1.0,<4" + +[package.extras] +berkeleydb = ["berkeleydb (>=18.1.0,<19.0.0)"] +html = ["html5lib (>=1.0,<2.0)"] +lxml = ["lxml (>=4.3.0,<5.0.0)"] +networkx = ["networkx (>=2.0.0,<3.0.0)"] + [[package]] name = "requests" version = "2.32.3" @@ -237,6 +286,17 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + [[package]] name = "tomli" version = "2.0.1" @@ -268,4 +328,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "0ca0bb1508c8da08eb6aeac6255865d858133aef5b09b9b119872d880df76b60" +content-hash = "202968486827cf46664fd533592fac8667cb3bc2b20ed820d397b7f13243acf2" diff --git a/pyproject.toml b/pyproject.toml index 4c11494..9a287ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ python = "^3.10" pytest = "^8.2.1" requests = "^2.32.3" proxy-py = "^2.4.4" +rdflib = "^7.0.0" [build-system] diff --git a/tests/test_proxy.py b/tests/oldtest_integration.py similarity index 81% rename from tests/test_proxy.py rename to tests/oldtest_integration.py index 34dde2f..d5d8b8e 100644 --- a/tests/test_proxy.py +++ b/tests/oldtest_integration.py @@ -2,6 +2,7 @@ import requests import time import subprocess +import itertools from ontologytimemachine.custom_proxy import IP, PORT @@ -168,5 +169,25 @@ def iri_generic_test(iri): print(f"Request failed for IRI: {iri}, Error: {e}") +def get_parameter_combinations(): +# Define the possible values for each parameter + ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] + ontoPrecedence = ['default', 'enforcedPriority', 'always'] + patchAcceptUpstream = [True, False] + ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', + 'latestArchive', 'timestampArchive', 'dependencyManifest'] + onlyOntologies = [True, False] + httpsIntercept = [True, False] + inspectRedirects = [True, False] + forwardHeaders = [True, False] + subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] + + combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, + onlyOntologies, httpsIntercept, inspectRedirects, + forwardHeaders, subjectBinarySearchThreshold)) + return combinations + + if __name__ == '__main__': + pytest.main() From b65ba5b4b7bbd53f7249d07788690f037e267956 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 3 Sep 2024 02:11:00 +0200 Subject: [PATCH 02/35] fix proxy --- README.md | 3 ++ ontologytimemachine/custom_proxy.py | 15 +++++- ontologytimemachine/utils/utils.py | 73 ++++++++++++++--------------- tests/test_integration.py | 0 4 files changed, 51 insertions(+), 40 deletions(-) create mode 100644 tests/test_integration.py diff --git a/README.md 
b/README.md index 413afd3..65c33ca 100644 --- a/README.md +++ b/README.md @@ -35,3 +35,6 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem ### Not working: - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# + + +python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname IP --port 8899 --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin --ontoFormat ntriples --ontoVersion originalFailoverLive --ontoPrecedence enforcedPriority \ No newline at end of file diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index a8c40e8..474257c 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -15,6 +15,7 @@ IP = '0.0.0.0' PORT = '8899' +config = None logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) @@ -24,7 +25,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) (self.ontoFormat, self.ontoVersion, self.only_ontologies, self.https_intercept, self.inspect_redirects, self.forward_headers, - self.subject_binary_search_threshold) = parse_arguments() + self.subject_binary_search_threshold) = config + logger.info(config) def before_upstream_connection(self, request: HttpParser): @@ -93,6 +95,15 @@ def queue_response(self, response): if __name__ == '__main__': + config = parse_arguments() + + + print('Cionfig') + print(config) + print(sys.argv) + + sys.argv = [sys.argv[0]] # TODO: fix this + sys.argv += [ '--ca-key-file', 'ca-key.pem', '--ca-cert-file', 'ca-cert.pem', @@ -104,5 +115,7 @@ def queue_response(self, response): '--plugins', __name__ + '.OntologyTimeMachinePlugin' ] + print(sys.argv) + logger.info("Starting OntologyTimeMachineProxy server...") proxy.main() \ No newline at end of file diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 2435393..04472cc 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -27,6 +27,7 @@ def parse_arguments(): parser = argparse.ArgumentParser(description='Process ontology format and version.') + # Defining ontoFormat argument with nested options parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') @@ -62,25 +63,6 @@ def parse_arguments(): parser.add_argument('--subjectBinarySearchThreshold', type=int, default=100, help='SubjectBinarySearchThreshold value.') - # Proxy native parameters - parser.add_argument('--ca-key-file', type=str, required=True, - help='Path to the CA key file.') - - parser.add_argument('--ca-cert-file', type=str, required=True, - help='Path to the CA certificate file.') - - parser.add_argument('--ca-signing-key-file', type=str, required=True, - help='Path to the CA signing key file.') - - parser.add_argument('--hostname', type=str, required=True, - help='Hostname for the proxy server.') - - parser.add_argument('--port', type=int, required=True, - help='Port for the proxy server.') - - parser.add_argument('--plugins', type=str, required=True, - help='Plugins for the proxy server.') - args = parser.parse_args() ontoFormat = { @@ -120,6 +102,10 @@ def get_headers(request): def get_ontology_from_request(request): logger.info('Get ontology from request') + print(f'Request protocol: {request.protocol}') + print(f'Request host: 
{request.host}') + print(f'Request _url: {request._url}') + print(f'Request path: {request.path}') if (request.method == b'GET' or request.method == b'HEAD') and not request.host: for k, v in request.headers.items(): if v[0].decode('utf-8') == 'Host': @@ -134,11 +120,17 @@ def get_ontology_from_request(request): return ontology, host, path -def get_mime_type(format): - # Guess the MIME type based on the format - mime_type, _ = mimetypes.guess_type(f'file.{format}') - # Return the guessed MIME type or a generic default if guessing fails - return mime_type or 'text/turtle' +def get_mime_type(format='turtle'): + # Define a mapping of formats to MIME types + format_to_mime = { + 'turtle': 'text/turtle', + 'ntriples': 'application/n-triples', + 'rdfxml': 'application/rdf+xml', + 'htmldocu': 'text/html' + } + + # Return the MIME type based on the format or use a generic default + return format_to_mime.get(format, 'text/turtle') def set_onto_format_headers(request, ontoFormat, ontoVersion): @@ -146,10 +138,14 @@ def set_onto_format_headers(request, ontoFormat, ontoVersion): # Determine the correct MIME type for the format mime_type = get_mime_type(ontoFormat['format']) + logger.info(f'Requested mimetype: {mime_type}') # Check the precedence and update the 'Accept' header if necessary - if ontoFormat['precedence'] in ['always', 'enforcedPriority'] or \ - (ontoFormat['precedence'] == 'default' and b'accept' not in request.headers): + if ontoFormat['precedence'] in ['always'] or \ + (ontoFormat['precedence'] == 'default' and request.headers[b'accept'][1] == b'*/*') or \ + request.headers[b'accept'][1] == b'*/*': + # Needed to make sure the accept header is define + # TODO: Clean up the conditions request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) logger.info(f'Accept header set to: {request.headers[b"accept"][1]}') @@ -199,7 +195,11 @@ def fetch_failover(ontology, headers, live=False, monitor=False): logger.info(f'Fetching original ontology with failover from URL: {ontology}') response = requests.get(url=ontology, headers=headers, timeout=5) logger.info('Successfully fetched original ontology') - if response.status_code in passthrough_status_codes_http: + requested_mime_type = headers.get('Accept', None) # Assuming you set the requested MIME type in the 'Accept' header + response_mime_type = response.headers.get('Content-Type', '').split(';')[0] + logger.info(f'Requested mimetype: {requested_mime_type}') + logger.info(f'Response mimetype: {response_mime_type}') + if response.status_code in passthrough_status_codes_http and requested_mime_type == response_mime_type: return response else: logging.info(f'Status code: {response.status_code}') @@ -299,6 +299,7 @@ def fetch_from_dbpedia_archivo_api(ontology, headers): try: logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') response = requests.get(dbpedia_url, timeout=5) + print(response) return response except requests.exceptions.RequestException as e: logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') @@ -306,21 +307,15 @@ def fetch_from_dbpedia_archivo_api(ontology, headers): def map_mime_to_format(mime_type): - # Use the mimetypes library to get the file extension - extension = mimetypes.guess_extension(mime_type) - if not extension: - return None - # Map file extensions to formats - ext_to_format = { - '.rdf': 'owl', - '.xml': 'owl', - '.ttl': 'ttl', - '.nt': 'nt', - # Add more mappings if needed + mime_to_format = { + 'application/rdf+xml': 'owl', # Common MIME type for OWL files + 
'application/owl+xml': 'owl', # Specific MIME type for OWL + 'text/turtle': 'ttl', # MIME type for Turtle format + 'application/n-triples': 'nt', # MIME type for N-Triples format } - return ext_to_format.get(extension, None) + return mime_to_format.get(mime_type, None) def get_parameters_from_headers(headers): diff --git a/tests/test_integration.py b/tests/test_integration.py new file mode 100644 index 0000000..e69de29 From 4258b982bfce661f02312ef26f7e95d49f1e9bec Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 3 Sep 2024 02:14:07 +0200 Subject: [PATCH 03/35] add integration tests again --- tests/test_integration.py | 193 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) diff --git a/tests/test_integration.py b/tests/test_integration.py index e69de29..d5d8b8e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -0,0 +1,193 @@ +import pytest +import requests +import time +import subprocess +import itertools +from ontologytimemachine.custom_proxy import IP, PORT + + +PROXY = f'{IP}:{PORT}' +HTTP_PROXY = f'http://{PROXY}' +HTTPS_PROXY = f'http://{PROXY}' +PROXIES = { + "http": HTTP_PROXY, + "https": HTTPS_PROXY +} +CA_CERT_PATH = "ca-cert.pem" + + +@pytest.fixture(scope="module", autouse=True) +def start_proxy_server(): + # Start the proxy server in a subprocess + process = subprocess.Popen( + [ + 'python3', '-m', 'proxy', + '--ca-key-file', 'ca-key.pem', + '--ca-cert-file', 'ca-cert.pem', + '--ca-signing-key-file', 'ca-signing-key.pem', + '--hostname', IP, + '--port', PORT, + '--plugins', 'ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin' + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE + ) + + # Wait a bit to ensure the server starts + time.sleep(5) + + yield + "http://0.0.0.0:8899" + # Terminate the proxy server after tests + process.terminate() + process.wait() + + +def test_babelnet(): + iri = 'http://babelnet.org/rdf/' + generic_test(iri, 'text/turtle') + + +def test_bag_basisregistraties(): + iri = 'http://bag.basisregistraties.overheid.nl/def/bag' + generic_test(iri, 'text/turtle') + + +def test_bblfish(): + iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' + generic_test(iri, 'text/turtle') + + +def test_brk_basisregistraties(): + iri = 'http://brk.basisregistraties.overheid.nl/def/brk' + generic_test(iri, 'text/turtle') + + +def test_brt_basisregistraties(): + iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' + generic_test(iri, 'text/turtle') + + +def test_brt_basisregistraties_begrippenkader(): + iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' + generic_test(iri, 'text/turtle') + + +def test_buzzword(): + iri = 'http://buzzword.org.uk/rdf/personal-link-types#' + generic_test(iri, 'text/turtle') + + +def test_catalogus_professorum(): + iri = 'http://catalogus-professorum.org/cpm/2/' + generic_test(iri, 'text/turtle') + + +def test_data_gov(): + iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' + generic_test(iri, 'text/turtle') + + +def test_data_bigdatagrapes(): + iri = 'http://data.bigdatagrapes.eu/resource/ontology/' + generic_test(iri, 'text/turtle') + + +def test_data_europa_esco(): + iri = 'http://data.europa.eu/esco/flow' + generic_test(iri, 'text/turtle') + + +def test_data_globalchange(): + iri = 'http://data.globalchange.gov/gcis.owl' + generic_test(iri, 'text/turtle') + + +def test_data_ontotext(): + iri = 'http://data.ontotext.com/resource/leak/' + generic_test(iri, 'text/turtle') + + +def test_data_opendiscoveryspace(): + iri = 
'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' + generic_test(iri, 'text/turtle') + + +def test_data_ordnancesurvey_50kGazetteer(): + iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' + generic_test(iri, 'text/turtle') + + +def test_data_ordnancesurvey_50kGazetteer(): + iri = 'http://dbpedia.org/ontology/Person' + generic_test(iri, 'text/turtle') + + +def test_linked_web_apis(): + iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' + generic_test(iri, 'text/turtle') + + +#def test_ontologi_es(): +# iri = 'http://ontologi.es/days#' +# generic_test(iri, 'text/turtle') + + +def test_https(): + iri = "https://www.w3id.org/simulation/ontology/" + generic_test(iri, 'text/plain; charset=utf-8') + + +def test_https(): + iri = "https://vocab.eccenca.com/auth/" + generic_test(iri, 'text/plain; charset=utf-8') + + +def not_test_all_iris(): + with open('tests/archivo_ontologies_test.txt', 'r') as file: + for line in file: + iri = line.strip() + if iri: # Ensure it's not an empty line + iri_generic_test(iri) + + +def generic_test(iri, content_type): + response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) + assert response.status_code == 200 + assert iri in response.content.decode('utf-8') + + +def iri_generic_test(iri): + try: + response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) + assert response.status_code == 200 + assert iri in response.content.decode('utf-8') + print(f"Test passed for IRI: {iri}") + except AssertionError: + print(f"Test failed for IRI: {iri}") + except requests.exceptions.RequestException as e: + print(f"Request failed for IRI: {iri}, Error: {e}") + + +def get_parameter_combinations(): +# Define the possible values for each parameter + ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] + ontoPrecedence = ['default', 'enforcedPriority', 'always'] + patchAcceptUpstream = [True, False] + ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', + 'latestArchive', 'timestampArchive', 'dependencyManifest'] + onlyOntologies = [True, False] + httpsIntercept = [True, False] + inspectRedirects = [True, False] + forwardHeaders = [True, False] + subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] + + combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, + onlyOntologies, httpsIntercept, inspectRedirects, + forwardHeaders, subjectBinarySearchThreshold)) + return combinations + + +if __name__ == '__main__': + + pytest.main() From bb8bbda91b782a920bf76d3e993ebdba7b88a22e Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 3 Sep 2024 02:30:19 +0200 Subject: [PATCH 04/35] fixing testcases --- tests/test_integration.py | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index d5d8b8e..3eb4f39 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -21,13 +21,7 @@ def start_proxy_server(): # Start the proxy server in a subprocess process = subprocess.Popen( [ - 'python3', '-m', 'proxy', - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', - '--hostname', IP, - '--port', PORT, - '--plugins', 'ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin' + 'python3', 'ontologytimemachine/custom_proxy.py', ], stdout=subprocess.PIPE, stderr=subprocess.PIPE @@ -128,29 +122,6 @@ def test_linked_web_apis(): generic_test(iri, 'text/turtle') -#def test_ontologi_es(): -# 
iri = 'http://ontologi.es/days#' -# generic_test(iri, 'text/turtle') - - -def test_https(): - iri = "https://www.w3id.org/simulation/ontology/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def test_https(): - iri = "https://vocab.eccenca.com/auth/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def not_test_all_iris(): - with open('tests/archivo_ontologies_test.txt', 'r') as file: - for line in file: - iri = line.strip() - if iri: # Ensure it's not an empty line - iri_generic_test(iri) - - def generic_test(iri, content_type): response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 From 705a926de4274e754ef8821c7d683db24082cf7c Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 3 Sep 2024 02:36:07 +0200 Subject: [PATCH 05/35] remove 2 testcases --- tests/test_integration.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 3eb4f39..356dd2e 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -102,16 +102,6 @@ def test_data_ontotext(): generic_test(iri, 'text/turtle') -def test_data_opendiscoveryspace(): - iri = 'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' - generic_test(iri, 'text/turtle') - - -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' - generic_test(iri, 'text/turtle') - - def test_data_ordnancesurvey_50kGazetteer(): iri = 'http://dbpedia.org/ontology/Person' generic_test(iri, 'text/turtle') From 6f1138b05ff4208c0c78d85309f7bd40995dc29d Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 3 Sep 2024 02:49:56 +0200 Subject: [PATCH 06/35] keep 2 integration testcases --- tests/test_integration.py | 78 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 356dd2e..10630e7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -37,69 +37,69 @@ def start_proxy_server(): process.wait() -def test_babelnet(): - iri = 'http://babelnet.org/rdf/' - generic_test(iri, 'text/turtle') +# def test_babelnet(): +# iri = 'http://babelnet.org/rdf/' +# generic_test(iri, 'text/turtle') -def test_bag_basisregistraties(): - iri = 'http://bag.basisregistraties.overheid.nl/def/bag' - generic_test(iri, 'text/turtle') +# def test_bag_basisregistraties(): +# iri = 'http://bag.basisregistraties.overheid.nl/def/bag' +# generic_test(iri, 'text/turtle') -def test_bblfish(): - iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' - generic_test(iri, 'text/turtle') +# def test_bblfish(): +# iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' +# generic_test(iri, 'text/turtle') -def test_brk_basisregistraties(): - iri = 'http://brk.basisregistraties.overheid.nl/def/brk' - generic_test(iri, 'text/turtle') +# def test_brk_basisregistraties(): +# iri = 'http://brk.basisregistraties.overheid.nl/def/brk' +# generic_test(iri, 'text/turtle') -def test_brt_basisregistraties(): - iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' - generic_test(iri, 'text/turtle') +# def test_brt_basisregistraties(): +# iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' +# generic_test(iri, 'text/turtle') -def test_brt_basisregistraties_begrippenkader(): - iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' - generic_test(iri, 'text/turtle') +# def test_brt_basisregistraties_begrippenkader(): +# iri = 
'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' +# generic_test(iri, 'text/turtle') -def test_buzzword(): - iri = 'http://buzzword.org.uk/rdf/personal-link-types#' - generic_test(iri, 'text/turtle') +# def test_buzzword(): +# iri = 'http://buzzword.org.uk/rdf/personal-link-types#' +# generic_test(iri, 'text/turtle') -def test_catalogus_professorum(): - iri = 'http://catalogus-professorum.org/cpm/2/' - generic_test(iri, 'text/turtle') +# def test_catalogus_professorum(): +# iri = 'http://catalogus-professorum.org/cpm/2/' +# generic_test(iri, 'text/turtle') -def test_data_gov(): - iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' - generic_test(iri, 'text/turtle') +# def test_data_gov(): +# iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' +# generic_test(iri, 'text/turtle') -def test_data_bigdatagrapes(): - iri = 'http://data.bigdatagrapes.eu/resource/ontology/' - generic_test(iri, 'text/turtle') +# def test_data_bigdatagrapes(): +# iri = 'http://data.bigdatagrapes.eu/resource/ontology/' +# generic_test(iri, 'text/turtle') -def test_data_europa_esco(): - iri = 'http://data.europa.eu/esco/flow' - generic_test(iri, 'text/turtle') +# def test_data_europa_esco(): +# iri = 'http://data.europa.eu/esco/flow' +# generic_test(iri, 'text/turtle') -def test_data_globalchange(): - iri = 'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') +# def test_data_globalchange(): +# iri = 'http://data.globalchange.gov/gcis.owl' +# generic_test(iri, 'text/turtle') -def test_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') +# def test_data_ontotext(): +# iri = 'http://data.ontotext.com/resource/leak/' +# generic_test(iri, 'text/turtle') def test_data_ordnancesurvey_50kGazetteer(): From adfe2c564d3c62b3c51d3bdd9b87aa2451cd8343 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Sat, 7 Sep 2024 20:45:08 +0200 Subject: [PATCH 07/35] finalize the wrapper and restructure the code --- ontologytimemachine/custom_proxy.py | 50 ++--- ontologytimemachine/proxy_wrapper.py | 101 +++++++++ ontologytimemachine/utils/proxy_logic.py | 166 +++++++++++++++ ontologytimemachine/utils/utils.py | 260 ++++------------------- tests/test_integration.py | 12 +- tests/test_unit.py | 15 +- 6 files changed, 345 insertions(+), 259 deletions(-) create mode 100644 ontologytimemachine/proxy_wrapper.py create mode 100644 ontologytimemachine/utils/proxy_logic.py diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 474257c..4333442 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -1,11 +1,12 @@ from proxy.http.proxy import HttpProxyBasePlugin -from proxy.http.parser import HttpParser, httpParserTypes +from proxy.http.parser import HttpParser from proxy.common.utils import build_http_response -from proxy.http.methods import HttpMethods -from ontologytimemachine.utils.utils import proxy_logic, parse_arguments -from ontologytimemachine.utils.utils import check_if_archivo_ontology_requested +from ontologytimemachine.utils.utils import parse_arguments from ontologytimemachine.utils.mock_responses import mock_response_403 -from requests.exceptions import SSLError, Timeout, ConnectionError, RequestException +from ontologytimemachine.proxy_wrapper import HttpRequestWrapper +from ontologytimemachine.utils.proxy_logic import proxy_logic, is_ontology_request_only_ontology +from ontologytimemachine.utils.proxy_logic import is_archivo_ontology_request +from 
ontologytimemachine.utils.proxy_logic import if_intercept_host from http.client import responses import proxy import sys @@ -24,62 +25,60 @@ class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) (self.ontoFormat, self.ontoVersion, self.only_ontologies, - self.https_intercept, self.inspect_redirects, self.forward_headers, - self.subject_binary_search_threshold) = config + self.https_intercept, self.inspect_redirects, + self.forward_headers) = config logger.info(config) - def before_upstream_connection(self, request: HttpParser): logger.info('Before upstream connection hook') logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + wrapped_request = HttpRequestWrapper(request) - if request.method == b'CONNECT': + if wrapped_request.is_connect_request(): logger.info(f'HTTPS interception mode: {self.https_intercept}') # Only intercept if interception is enabled - if self.https_intercept in ['all', 'archivo']: + # Move this to the utils + if if_intercept_host(self.https_intercept): + logger.info('HTTPS interception is on, forwardig the request') return request else: + logger.info('HTTPS interception is turned off') return None - - ontology_request = check_if_archivo_ontology_requested(request) # If only ontology mode, return None in all other cases - if self.only_ontologies and not ontology_request: + if is_ontology_request_only_ontology(wrapped_request, self.only_ontologies): logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') self.queue_response(mock_response_403) return None - if ontology_request: + if is_archivo_ontology_request(wrapped_request): logger.debug('The request is for an ontology') - response = proxy_logic(request, self.ontoFormat, self.ontoVersion) + response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion) self.queue_response(response) return None return request - def handle_client_request(self, request: HttpParser): logger.info('Handle client request hook') logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') - logger.debug(request.method) - if request.method == b'CONNECT': + wrapped_request = HttpRequestWrapper(request) + if wrapped_request.is_connect_request(): return request - ontology_request = check_if_archivo_ontology_requested(request) - if not ontology_request: + is_ontology_request = is_archivo_ontology_request(wrapped_request) + if not is_ontology_request: logger.info('The requested IRI is not part of DBpedia Archivo') return request - response = proxy_logic(request, self.ontoFormat, self.ontoVersion) + response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion) self.queue_response(response) return None - def handle_upstream_chunk(self, chunk: memoryview): return chunk - def queue_response(self, response): self.client.queue( build_http_response( @@ -96,11 +95,6 @@ def queue_response(self, response): if __name__ == '__main__': config = parse_arguments() - - - print('Cionfig') - print(config) - print(sys.argv) sys.argv = [sys.argv[0]] # TODO: fix this diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py new file mode 100644 index 0000000..fe5d3db --- /dev/null +++ b/ontologytimemachine/proxy_wrapper.py @@ -0,0 +1,101 @@ +from abc import ABC, abstractmethod +from proxy.http.parser import HttpParser +import 
logging + + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +class AbstractRequestWrapper(ABC): + def __init__(self, request): + self.request = request + + @abstractmethod + def is_get_request(self) -> bool: + pass + + @abstractmethod + def is_connect_request(self) -> bool: + pass + + @abstractmethod + def is_head_request(self) -> bool: + pass + + @abstractmethod + def is_https_request(self) -> bool: + pass + + @abstractmethod + def get_request(self): + pass + + @abstractmethod + def get_request_headers(self): + pass + + @abstractmethod + def get_request_accept_header(self): + pass + + @abstractmethod + def set_request_accept_header(self, mime_type): + pass + + @abstractmethod + def get_ontology_from_request(self): + pass + + +class HttpRequestWrapper(AbstractRequestWrapper): + def __init__(self, request: HttpParser): + super().__init__(request) + + def is_get_request(self) -> bool: + return self.request.method == b'GET' + + def is_connect_request(self): + return self.request.method == b'CONNECT' + + def is_head_request(self): + return self.request.method == b'HEAD' + + def is_https_request(self): + return self.request.method == b'CONNECT' or self.request.headers.get(b'Host', b'').startswith(b'https') + + def get_request(self): + return self.request + + def get_request_headers(self): + headers = {} + for k, v in self.request.headers.items(): + headers[v[0].decode('utf-8')] = v[1].decode('utf-8') + return headers + + def get_request_accept_header(self): + logger.info('Wrapper - get_request_accept_header') + return self.request.headers[b'accept'][1].decode('utf-8') + + def set_request_accept_header(self, mime_type): + self.request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) + logger.info(f'Accept header set to: {self.request.headers[b"accept"][1]}') + + def get_ontology_from_request(self): + logger.info('Get ontology from request') + print(f'Request protocol: {self.request.protocol}') + print(f'Request host: {self.request.host}') + print(f'Request _url: {self.request._url}') + print(f'Request path: {self.request.path}') + if (self.request.method == b'GET' or self.request.method == b'HEAD') and not self.request.host: + for k, v in self.request.headers.items(): + if v[0].decode('utf-8') == 'Host': + host = v[1].decode('utf-8') + path = self.request.path.decode('utf-8') + ontology = 'https://' + host + path + else: + host = self.request.host.decode('utf-8') + path = self.request.path.decode('utf-8') + ontology = str(self.request._url) + logger.info(f'Ontology: {ontology}') + return ontology, host, path \ No newline at end of file diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py new file mode 100644 index 0000000..0836ce2 --- /dev/null +++ b/ontologytimemachine/utils/proxy_logic.py @@ -0,0 +1,166 @@ +import logging +import requests +import rdflib +from urllib.parse import urlparse + +from ontologytimemachine.utils.utils import set_onto_format_headers, get_parameters_from_headers +from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes_http +from ontologytimemachine.utils.mock_responses import mock_response_500 +from ontologytimemachine.utils.mock_responses import mock_response_404 + + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def if_intercept_host(https_intercept): + if https_intercept in ['all']: + return True + return False + + 
+def is_ontology_request_only_ontology(wrapped_request, only_ontologies): + is_archivo_ontology = is_archivo_ontology_request(wrapped_request) + if only_ontologies and not is_archivo_ontology: + return True + return False + + +def is_archivo_ontology_request(wrapped_request): + with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: + urls = [line.strip() for line in file] + parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] + + _, request_host, request_path = wrapped_request.get_ontology_from_request() + for host, path in parsed_urls: + if request_host == host and request_path.startswith(path): + return True + return False + + +def proxy_logic(wrapped_request, ontoFormat, ontoVersion): + logger.info('Proxy has to intervene') + + set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) + + headers = wrapped_request.get_request_headers() + logger.info(f'Updated headers: {headers}') + ontology, _, _ = wrapped_request.get_ontology_from_request() + if ontoVersion == 'original': + response = fetch_original(ontology, headers) + elif ontoVersion == 'originalFailoverLive': + response = fetch_failover(ontology, headers, live=True) + elif ontoVersion == 'originalFailoverArchivoontoVersionMonitor': + response = fetch_failover(ontology, headers, monitor=True) + elif ontoVersion == 'latestArchive': + response = fetch_latest_archive(ontology, headers) + elif ontoVersion == 'timestampArchive': + response = fetch_timestamp_archive(ontology, headers) + elif ontoVersion == 'dependencyManifest': + response = fetch_dependency_manifest(ontology, headers) + + return response + + +# Fetch from the original source, no matter what +def fetch_original(ontology, headers): + logger.info(f'Fetching original ontology from URL: {ontology}') + try: + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched original ontology') + return response + except Exception as e: + logger.error(f'Error fetching original ontology: {e}') + return mock_response_500() + + +# Failover mode +def fetch_failover(ontology, headers, live=False, monitor=False): + try: + logger.info(f'Fetching original ontology with failover from URL: {ontology}') + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched original ontology') + requested_mime_type = headers.get('Accept', None) # Assuming you set the requested MIME type in the 'Accept' header + response_mime_type = response.headers.get('Content-Type', '').split(';')[0] + logger.info(f'Requested mimetype: {requested_mime_type}') + logger.info(f'Response mimetype: {response_mime_type}') + if response.status_code in passthrough_status_codes_http and requested_mime_type == response_mime_type: + return response + else: + logging.info(f'Status code: {response.status_code}') + return fetch_from_dbpedia_archivo_api(ontology, headers) + except Exception as e: + logger.error(f'Error fetching original ontology: {e}') + if live: + logger.info('Attempting to fetch live version due to failover') + return fetch_from_dbpedia_archivo_api(ontology, headers) + elif monitor: + logger.info('Attempting to fetch archive monitor version due to failover') + # TODO + return mock_response_404 + else: + return mock_response_500 + + +def fetch_from_dbpedia_archivo_api(ontology, headers): + format, version, versionMatching = get_parameters_from_headers(headers) + dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' + try: + logger.info(f'Fetching from DBpedia Archivo API: 
{dbpedia_url}') + response = requests.get(dbpedia_url, timeout=5) + print(response) + return response + except requests.exceptions.RequestException as e: + logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') + return mock_response_404() + + +# Fetch the lates version from archivo (no timestamp defined) +def fetch_latest_archive(ontology, headers): + logger.info(f'Fetching latest archive ontology from URL: {ontology}/latest') + try: + response = requests.get(url=ontology, headers=headers, timeout=5) + logger.info('Successfully fetched latest archive ontology') + return response + except Exception as e: + logger.error(f'Error fetching latest archive ontology: {e}') + return mock_response_500 + + +def fetch_timestamp_archive(ontology, headers): + return mock_response_404 + + +def fetch_dependency_manifest(ontology, headers): + dependencies_file = "ontologytimemachine/utils/dependency.ttl" + # Parse RDF data from the dependencies file + g = rdflib.Graph() + g.parse(dependencies_file, format="turtle") + + version_namespace = rdflib.Namespace("https://example.org/versioning/") + + # Extract dependencies related to the ontology link + ontology = rdflib.URIRef(ontology) + + dependencies = g.subjects(predicate=version_namespace.dependency, object=ontology) + + for dependency in dependencies: + dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) + dep_file = g.value(subject=dependency, predicate=version_namespace.file) + + # Make request to DBpedia archive API + base_api_url = "https://archivo.dbpedia.org/download" + + if dep_file: + version_param = dep_file.split('v=')[1] + api_url = f"{base_api_url}?o={ontology}&v={version_param}" + else: + api_url = f"{base_api_url}?o={ontology}" + + response = requests.get(api_url) + if response.status_code == 200: + logger.info(f"Successfully fetched {api_url}") + return response + else: + logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") + return mock_response_404 \ No newline at end of file diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 04472cc..12786ee 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -1,12 +1,5 @@ -from proxy.http.parser import HttpParser, httpParserTypes -from requests.exceptions import SSLError, Timeout, ConnectionError, RequestException -from ontologytimemachine.utils.mock_responses import mock_response_403, mock_response_404, mock_response_500, mock_response_200 -from http.client import responses -from urllib.parse import urlparse import logging -import requests import argparse -import rdflib import mimetypes @@ -24,10 +17,10 @@ 451, ] + def parse_arguments(): parser = argparse.ArgumentParser(description='Process ontology format and version.') - # Defining ontoFormat argument with nested options parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') @@ -59,10 +52,6 @@ def parse_arguments(): parser.add_argument('--forwardHeaders', type=bool, default=True, help='Enable/disable proxy forward headers.') - # SubjectBinarySearchThreshold - parser.add_argument('--subjectBinarySearchThreshold', type=int, default=100, - help='SubjectBinarySearchThreshold value.') - args = parser.parse_args() ontoFormat = { @@ -73,51 +62,11 @@ def parse_arguments(): logger.info(f'Ontology Format: {ontoFormat}') logger.info(f'Ontology Version: {args.ontoVersion}') 
- #logger.info(f'Only Ontologies Mode: {args.onlyOntologies}') - #logger.info(f'HTTPS Interception: {args.httpsIntercept}') - #logger.info(f'Inspect Redirects: {args.inspectRedirects}') - #logger.info(f'Forward Headers: {args.forwardHeaders}') - #logger.info(f'Subject Binary Search Threshold: {args.subjectBinarySearchThreshold}') - return ontoFormat, args.ontoVersion, args.onlyOntologies, args.httpsIntercept, args.inspectRedirects, args.forwardHeaders, args.subjectBinarySearchThreshold - - -def check_if_archivo_ontology_requested(request): - with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: - urls = [line.strip() for line in file] - parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] - - _, request_host, request_path = get_ontology_from_request(request) - for host, path in parsed_urls: - if request_host == host and request_path.startswith(path): - return True - return False - - -def get_headers(request): - headers = {} - for k, v in request.headers.items(): - headers[v[0].decode('utf-8')] = v[1].decode('utf-8') - return headers - - -def get_ontology_from_request(request): - logger.info('Get ontology from request') - print(f'Request protocol: {request.protocol}') - print(f'Request host: {request.host}') - print(f'Request _url: {request._url}') - print(f'Request path: {request.path}') - if (request.method == b'GET' or request.method == b'HEAD') and not request.host: - for k, v in request.headers.items(): - if v[0].decode('utf-8') == 'Host': - host = v[1].decode('utf-8') - path = request.path.decode('utf-8') - ontology = 'https://' + host + path - else: - host = request.host.decode('utf-8') - path = request.path.decode('utf-8') - ontology = str(request._url) - logger.info(f'Ontology: {ontology}') - return ontology, host, path + logger.info(f'Only Ontologies Mode: {args.onlyOntologies}') + logger.info(f'HTTPS Interception: {args.httpsIntercept}') + logger.info(f'Inspect Redirects: {args.inspectRedirects}') + logger.info(f'Forward Headers: {args.forwardHeaders}') + return ontoFormat, args.ontoVersion, args.onlyOntologies, args.httpsIntercept, args.inspectRedirects, args.forwardHeaders def get_mime_type(format='turtle'): @@ -133,178 +82,55 @@ def get_mime_type(format='turtle'): return format_to_mime.get(format, 'text/turtle') -def set_onto_format_headers(request, ontoFormat, ontoVersion): +def set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion): logger.info(f'Setting headers based on ontoFormat: {ontoFormat}') # Determine the correct MIME type for the format mime_type = get_mime_type(ontoFormat['format']) logger.info(f'Requested mimetype: {mime_type}') + logger.info(f'Wrapper isconnect: {wrapped_request.is_connect_request()}') + + request_accept_header = wrapped_request.get_request_accept_header() + # Check the precedence and update the 'Accept' header if necessary + # Redefine the condition if ontoFormat['precedence'] in ['always'] or \ - (ontoFormat['precedence'] == 'default' and request.headers[b'accept'][1] == b'*/*') or \ - request.headers[b'accept'][1] == b'*/*': + (ontoFormat['precedence'] == 'default' and request_accept_header == '*/*') or \ + request_accept_header == '*/*': # Needed to make sure the accept header is define - # TODO: Clean up the conditions - request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) - logger.info(f'Accept header set to: {request.headers[b"accept"][1]}') + wrapped_request.set_request_accept_header(mime_type) # Check if patchAcceptUpstream is true and ontoVersion is 'original' if 
ontoFormat['patchAcceptUpstream'] and ontoVersion == 'original': - request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) - logger.info(f'Accept header patched upstream: {request.headers[b"accept"][1]}') - - -def proxy_logic(request: HttpParser, ontoFormat, ontoVersion): - logger.info('Proxy has to intervene') - set_onto_format_headers(request, ontoFormat, ontoVersion) - headers = get_headers(request) - logger.info(f'Updated headers: {request.headers}') - ontology, _, _ = get_ontology_from_request(request) - if ontoVersion == 'original': - response = fetch_original(ontology, headers) - elif ontoVersion == 'originalFailoverLive': - response = fetch_failover(ontology, headers, live=True) - elif ontoVersion == 'originalFailoverArchivoontoVersionMonitor': - response = fetch_failover(ontology, headers, monitor=True) - elif ontoVersion == 'latestArchive': - response = fetch_latest_archive(ontology, headers) - elif ontoVersion == 'timestampArchive': - response = fetch_timestamp_archive(ontology, headers) - elif ontoVersion == 'dependencyManifest': - response = fetch_dependency_manifest(ontology, headers) - - return response - - -# Fetch from the original source, no matter what -def fetch_original(ontology, headers): - logger.info(f'Fetching original ontology from URL: {ontology}') - try: - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched original ontology') - return response - except Exception as e: - logger.error(f'Error fetching original ontology: {e}') - return mock_response_500() - - -# Failover mode -def fetch_failover(ontology, headers, live=False, monitor=False): - try: - logger.info(f'Fetching original ontology with failover from URL: {ontology}') - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched original ontology') - requested_mime_type = headers.get('Accept', None) # Assuming you set the requested MIME type in the 'Accept' header - response_mime_type = response.headers.get('Content-Type', '').split(';')[0] - logger.info(f'Requested mimetype: {requested_mime_type}') - logger.info(f'Response mimetype: {response_mime_type}') - if response.status_code in passthrough_status_codes_http and requested_mime_type == response_mime_type: - return response - else: - logging.info(f'Status code: {response.status_code}') - return fetch_from_dbpedia_archivo_api(ontology, headers) - except Exception as e: - logger.error(f'Error fetching original ontology: {e}') - if live: - logger.info('Attempting to fetch live version due to failover') - return fetch_from_dbpedia_archivo_api(ontology, headers) - elif monitor: - logger.info('Attempting to fetch archive monitor version due to failover') - # TODO - return mock_response_404 - else: - return mock_response_500 - - -# Fetch the lates version from archivo (no timestamp defined) -def fetch_latest_archive(ontology, headers): - logger.info(f'Fetching latest archive ontology from URL: {ontology}/latest') - try: - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched latest archive ontology') - return response - except Exception as e: - logger.error(f'Error fetching latest archive ontology: {e}') - return mock_response_500 - - -def fetch_timestamp_archive(ontology, headers): - return mock_response_404 - - -def fetch_dependency_manifest(ontology, headers): - dependencies_file = "ontologytimemachine/utils/dependency.ttl" - # Parse RDF data from the dependencies file - g = rdflib.Graph() - 
g.parse(dependencies_file, format="turtle") - - version_namespace = rdflib.Namespace("https://example.org/versioning/") - - # Extract dependencies related to the ontology link - ontology = rdflib.URIRef(ontology) - - dependencies = g.subjects(predicate=version_namespace.dependency, object=ontology) - - for dependency in dependencies: - dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) - dep_file = g.value(subject=dependency, predicate=version_namespace.file) - - # Make request to DBpedia archive API - base_api_url = "https://archivo.dbpedia.org/download" - - if dep_file: - version_param = dep_file.split('v=')[1] - api_url = f"{base_api_url}?o={ontology}&v={version_param}" - else: - api_url = f"{base_api_url}?o={ontology}" - - response = requests.get(api_url) - if response.status_code == 200: - logger.info(f"Successfully fetched {api_url}") - return response - else: - logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") - return mock_response_404 - - -def failover_mode(request): - headers = get_headers(request) - logger.info('Failover mode') - - ontology, _, _ = get_ontology_from_request(request) - try: - response = requests.get(url=ontology, headers=headers, timeout=5) - if response.history: - logger.debug("Request was redirected") - for resp in response.history: - logger.debug(f"{resp.status_code}, {resp.url}") - logger.debug(f"Final destination: {response.status_code}, {response.url}") - else: - logger.debug("Request was not redirected") - content_type = response.headers.get('Content-Type') - logger.debug(content_type) - if response.status_code in passthrough_status_codes_http: - return response - else: - logging.info(f'Status code: {response.status_code}') - return fetch_from_dbpedia_archivo_api(ontology, headers) - except (SSLError, Timeout, ConnectionError, RequestException) as e: - return fetch_from_dbpedia_archivo_api(ontology, headers) - - -def fetch_from_dbpedia_archivo_api(ontology, headers): - format, version, versionMatching = get_parameters_from_headers(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' - try: - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') - response = requests.get(dbpedia_url, timeout=5) - print(response) - return response - except requests.exceptions.RequestException as e: - logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') - return mock_response_404() - + wrapped_request.set_request_accept_header(mime_type) + + + +# def failover_mode(request): +# headers = get_headers(request) +# logger.info('Failover mode') + +# ontology, _, _ = get_ontology_from_request(request) +# try: +# response = requests.get(url=ontology, headers=headers, timeout=5) +# if response.history: +# logger.debug("Request was redirected") +# for resp in response.history: +# logger.debug(f"{resp.status_code}, {resp.url}") +# logger.debug(f"Final destination: {response.status_code}, {response.url}") +# else: +# logger.debug("Request was not redirected") +# content_type = response.headers.get('Content-Type') +# logger.debug(content_type) +# if response.status_code in passthrough_status_codes_http: +# return response +# else: +# logging.info(f'Status code: {response.status_code}') +# return fetch_from_dbpedia_archivo_api(ontology, headers) +# except (SSLError, Timeout, ConnectionError, RequestException) as e: +# return fetch_from_dbpedia_archivo_api(ontology, headers) + def map_mime_to_format(mime_type): # Map file extensions to formats diff --git 
a/tests/test_integration.py b/tests/test_integration.py index 10630e7..2236854 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -97,14 +97,14 @@ def start_proxy_server(): # generic_test(iri, 'text/turtle') -# def test_data_ontotext(): -# iri = 'http://data.ontotext.com/resource/leak/' -# generic_test(iri, 'text/turtle') +def test_data_ontotext(): + iri = 'http://data.ontotext.com/resource/leak/' + generic_test(iri, 'text/turtle') -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://dbpedia.org/ontology/Person' - generic_test(iri, 'text/turtle') +# def test_data_ordnancesurvey_50kGazetteer(): +# iri = 'http://dbpedia.org/ontology/Person' +# generic_test(iri, 'text/turtle') def test_linked_web_apis(): diff --git a/tests/test_unit.py b/tests/test_unit.py index f0f76e8..578f618 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -2,6 +2,7 @@ from unittest.mock import patch, Mock import argparse import requests + from ontologytimemachine.utils.mock_responses import ( mock_response_200, mock_response_403, @@ -9,12 +10,14 @@ mock_response_500 ) from ontologytimemachine.utils.utils import ( - parse_arguments, - fetch_from_dbpedia_archivo_api, + parse_arguments, map_mime_to_format, get_parameters_from_headers ) +from ontologytimemachine.utils.proxy_logic import ( + fetch_from_dbpedia_archivo_api +) class TestUtils(unittest.TestCase): @@ -28,8 +31,7 @@ def test_parse_arguments(self, mock_parse_args): onlyOntologies=True, httpsIntercept=False, inspectRedirects=True, - forwardHeaders=True, - subjectBinarySearchThreshold=100 + forwardHeaders=True ) args = parse_arguments() @@ -42,7 +44,6 @@ def test_parse_arguments(self, mock_parse_args): self.assertFalse(args[3]) self.assertTrue(args[4]) self.assertTrue(args[5]) - self.assertEqual(args[6], 100) mock_parse_args.return_value = argparse.Namespace( ontoFormat='ntriples', @@ -52,8 +53,7 @@ def test_parse_arguments(self, mock_parse_args): onlyOntologies=False, httpsIntercept=True, inspectRedirects=False, - forwardHeaders=False, - subjectBinarySearchThreshold=50 + forwardHeaders=False ) args = parse_arguments() @@ -66,7 +66,6 @@ def test_parse_arguments(self, mock_parse_args): self.assertTrue(args[3]) self.assertFalse(args[4]) self.assertFalse(args[5]) - self.assertEqual(args[6], 50) @patch('requests.get') From a302bf411296e144be84c9ec3295df84aa224e10 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Sun, 8 Sep 2024 16:22:18 +0200 Subject: [PATCH 08/35] implement the functionalities discussed during the last meeting --- ontologytimemachine/custom_proxy.py | 16 +- ontologytimemachine/utils/proxy_logic.py | 184 +++++++++++------------ ontologytimemachine/utils/utils.py | 153 ++++++++++++------- poetry.lock | 88 ++++++++++- pyproject.toml | 1 + tests/test_unit.py | 52 +++---- 6 files changed, 302 insertions(+), 192 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 4333442..0e74b4e 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -24,9 +24,9 @@ class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - (self.ontoFormat, self.ontoVersion, self.only_ontologies, - self.https_intercept, self.inspect_redirects, - self.forward_headers) = config + (self.ontoFormat, self.ontoVersion, self.restrictedAccess, + self.httpsInterception, self.disableRemovingRedirects, + self.forward_headers, self.timestamp, self.manifest) = config logger.info(config) def 
before_upstream_connection(self, request: HttpParser): @@ -35,10 +35,10 @@ def before_upstream_connection(self, request: HttpParser): wrapped_request = HttpRequestWrapper(request) if wrapped_request.is_connect_request(): - logger.info(f'HTTPS interception mode: {self.https_intercept}') + logger.info(f'HTTPS interception mode: {self.httpsInterception}') # Only intercept if interception is enabled # Move this to the utils - if if_intercept_host(self.https_intercept): + if if_intercept_host(self.httpsInterception): logger.info('HTTPS interception is on, forwarding the request') return request else: @@ -46,14 +46,14 @@ def before_upstream_connection(self, request: HttpParser): return None # If only ontology mode, return None in all other cases - if is_ontology_request_only_ontology(wrapped_request, self.only_ontologies): + if is_ontology_request_only_ontology(wrapped_request, self.restrictedAccess): logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') self.queue_response(mock_response_403) return None if is_archivo_ontology_request(wrapped_request): logger.debug('The request is for an ontology') - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion) + response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) self.queue_response(response) return None return request @@ -71,7 +71,7 @@ def handle_client_request(self, request: HttpParser): logger.info('The requested IRI is not part of DBpedia Archivo') return request - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion) + response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) self.queue_response(response) return None diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 0836ce2..0837dba 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -3,8 +3,9 @@ import rdflib from urllib.parse import urlparse -from ontologytimemachine.utils.utils import set_onto_format_headers, get_parameters_from_headers +from ontologytimemachine.utils.utils import set_onto_format_headers, get_format_from_accept_header +from ontologytimemachine.utils.utils import parse_accept_header_with_priority from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes from ontologytimemachine.utils.mock_responses import mock_response_500 from ontologytimemachine.utils.mock_responses import mock_response_404 @@ -27,6 +28,7 @@ def is_ontology_request_only_ontology(wrapped_request, only_ontologies): def is_archivo_ontology_request(wrapped_request): + logger.info('Chekc if the requested ontology is in archivo') with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: urls = [line.strip() for line in file] parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] @@ -38,129 +40,119 @@ def is_archivo_ontology_request(wrapped_request): return False -def proxy_logic(wrapped_request, ontoFormat, ontoVersion): +def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): + allow_redirects = not disableRemovingRedirects + try: + response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=5) + logger.info('Successfully fetched original ontology') + return response +
except Exception as e: + logger.error(f'Error fetching original ontology: {e}') + return mock_response_404() + + +def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp, manifest): logger.info('Proxy has to intervene') set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) headers = wrapped_request.get_request_headers() - logger.info(f'Updated headers: {headers}') ontology, _, _ = wrapped_request.get_ontology_from_request() + + # if the requested format is not in Archivo and the ontoVersion is not original + # we can stop because the archivo request will not go through + format = get_format_from_accept_header(headers) + if not format and ontoVersion != 'original': + logger.info(f'No format can be used from Archivo') + return mock_response_500 + if ontoVersion == 'original': - response = fetch_original(ontology, headers) - elif ontoVersion == 'originalFailoverLive': - response = fetch_failover(ontology, headers, live=True) - elif ontoVersion == 'originalFailoverArchivoontoVersionMonitor': - response = fetch_failover(ontology, headers, monitor=True) - elif ontoVersion == 'latestArchive': - response = fetch_latest_archive(ontology, headers) - elif ontoVersion == 'timestampArchive': - response = fetch_timestamp_archive(ontology, headers) + response = fetch_original(ontology, headers, disableRemovingRedirects) + elif ontoVersion == 'originalFailoverLiveLatest': + response = fetch_failover(ontology, headers, disableRemovingRedirects) + elif ontoVersion == 'latestArchived': + response = fetch_latest_archived(ontology, headers) + elif ontoVersion == 'timestampArchived': + response = fetch_timestamp_archived(ontology, headers, timestamp) elif ontoVersion == 'dependencyManifest': - response = fetch_dependency_manifest(ontology, headers) + response = fetch_dependency_manifest(ontology, headers, manifest) return response # Fetch from the original source, no matter what -def fetch_original(ontology, headers): +def fetch_original(ontology, headers, disableRemovingRedirects): logger.info(f'Fetching original ontology from URL: {ontology}') - try: - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched original ontology') - return response - except Exception as e: - logger.error(f'Error fetching original ontology: {e}') - return mock_response_500() + return request_ontology(ontology, headers, disableRemovingRedirects) # Failover mode -def fetch_failover(ontology, headers, live=False, monitor=False): - try: - logger.info(f'Fetching original ontology with failover from URL: {ontology}') - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched original ontology') - requested_mime_type = headers.get('Accept', None) # Assuming you set the requested MIME type in the 'Accept' header - response_mime_type = response.headers.get('Content-Type', '').split(';')[0] - logger.info(f'Requested mimetype: {requested_mime_type}') +def fetch_failover(ontology, headers, disableRemovingRedirects): + logger.info(f'Fetching original ontology with failover from URL: {ontology}') + original_response = request_ontology(ontology, headers, disableRemovingRedirects) + if original_response.status_code in passthrough_status_codes: + requested_mimetypes_with_priority = parse_accept_header_with_priority(headers['Accept']) + requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] + response_mime_type = original_response.headers.get('Content-Type', ';').split(';')[0] + 
logger.info(f'Requested mimetypes: {requested_mimetypes}') logger.info(f'Response mimetype: {response_mime_type}') - if response.status_code in passthrough_status_codes_http and requested_mime_type == response_mime_type: - return response + if response_mime_type in requested_mimetypes: + return original_response else: - logging.info(f'Status code: {response.status_code}') - return fetch_from_dbpedia_archivo_api(ontology, headers) - except Exception as e: - logger.error(f'Error fetching original ontology: {e}') - if live: - logger.info('Attempting to fetch live version due to failover') - return fetch_from_dbpedia_archivo_api(ontology, headers) - elif monitor: - logger.info('Attempting to fetch archive monitor version due to failover') - # TODO - return mock_response_404 - else: - return mock_response_500 - - -def fetch_from_dbpedia_archivo_api(ontology, headers): - format, version, versionMatching = get_parameters_from_headers(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' - try: - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') - response = requests.get(dbpedia_url, timeout=5) - print(response) - return response - except requests.exceptions.RequestException as e: - logging.error(f'Exception occurred while fetching from DBpedia Archivo API: {e}') - return mock_response_404() + logging.info(f'The returned type is not the same as the requested one') + return fetch_latest_archived(ontology, headers) + else: + logger.info(f'The returned status code is not accepted: {original_response.status_code}') + return fetch_latest_archived(ontology, headers) # Fetch the latest version from archivo (no timestamp defined) -def fetch_latest_archive(ontology, headers): - logger.info(f'Fetching latest archive ontology from URL: {ontology}/latest') - try: - response = requests.get(url=ontology, headers=headers, timeout=5) - logger.info('Successfully fetched latest archive ontology') - return response - except Exception as e: - logger.error(f'Error fetching latest archive ontology: {e}') - return mock_response_500 +def fetch_latest_archived(ontology, headers): + logger.info('Fetch latest archived') + format = get_format_from_accept_header(headers) + dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' + logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + return request_ontology(dbpedia_url, headers) + -def fetch_timestamp_archive(ontology, headers): - return mock_response_404 +def fetch_timestamp_archived(ontology, headers, timestamp): + logger.info('Fetch archivo timestamp') + format = get_format_from_accept_header(headers) + dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}&v={timestamp}' + logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + return request_ontology(dbpedia_url, headers) -def fetch_dependency_manifest(ontology, headers): - dependencies_file = "ontologytimemachine/utils/dependency.ttl" - # Parse RDF data from the dependencies file - g = rdflib.Graph() - g.parse(dependencies_file, format="turtle") +def fetch_dependency_manifest(ontology, headers, manifest): + logger.info(f'The dependency manifest is currently not supported') + return mock_response_500 + # # Parse RDF data from the dependencies file + # manifest_g = rdflib.Graph() + # manifest_g.parse(manifest, format="turtle") - version_namespace = rdflib.Namespace("https://example.org/versioning/") + # version_namespace = rdflib.Namespace(ontology) - # Extract dependencies related to the ontology link - ontology = rdflib.URIRef(ontology) + # # Extract dependencies related to
the ontology link + # ontology = rdflib.URIRef(ontology) - dependencies = g.subjects(predicate=version_namespace.dependency, object=ontology) + # dependencies = manifest_g.subjects(predicate=version_namespace.dependency, object=ontology) - for dependency in dependencies: - dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) - dep_file = g.value(subject=dependency, predicate=version_namespace.file) - - # Make request to DBpedia archive API - base_api_url = "https://archivo.dbpedia.org/download" + # for dependency in dependencies: + # dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) + # dep_file = g.value(subject=dependency, predicate=version_namespace.file) - if dep_file: - version_param = dep_file.split('v=')[1] - api_url = f"{base_api_url}?o={ontology}&v={version_param}" - else: - api_url = f"{base_api_url}?o={ontology}" + # # Make request to DBpedia archive API + # if dep_file: + # version_param = dep_file.split('v=')[1] + # api_url = f"{dbpedia_api}?o={ontology}&v={version_param}" + # else: + # api_url = f"{dbpedia_api}?o={ontology}" - response = requests.get(api_url) - if response.status_code == 200: - logger.info(f"Successfully fetched {api_url}") - return response - else: - logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") - return mock_response_404 \ No newline at end of file + # response = requests.get(api_url) + # if response.status_code == 200: + # logger.info(f"Successfully fetched {api_url}") + # return response + # else: + # logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") + # return mock_response_404 \ No newline at end of file diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 12786ee..8c48945 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -1,6 +1,6 @@ import logging import argparse -import mimetypes +from werkzeug.http import parse_accept_header logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') @@ -8,9 +8,9 @@ dbpedia_api = 'https://archivo.dbpedia.org/download' +archivo_mimetypes = ['application/rdf+xml', 'application/owl+xml', 'text/turtle', 'application/n-triples'] - -passthrough_status_codes_http = [ +passthrough_status_codes = [ 100, 101, 102, 103, 200, 300, 301, 302, 303, 304, 307, 308, @@ -32,20 +32,20 @@ def parse_arguments(): help='Defines if the Accept Header is patched upstream in original mode.') # Defining ontoVersion argument - parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', - 'latestArchive', 'timestampArchive', 'dependencyManifest'], - default='originalFailoverLive', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') + parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLiveLatest', + 'latestArchived', 'timestampArchived', 'dependencyManifest'], + default='originalFailoverLiveLatest', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') # Enable/disable mode to only proxy requests to ontologies - parser.add_argument('--onlyOntologies', type=bool, default=False, - help='Enable/disable mode to only proxy requests to ontologies.') + parser.add_argument('--restrictedAccess', type=bool, default=False, + 
help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') # Enable HTTPS interception for specific domains - parser.add_argument('--httpsIntercept', type=str, choices=['none', 'archivo', 'all', 'listfilename'], + parser.add_argument('--httpsInterception', type=str, choices=['none', 'all'], default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') # Enable/disable inspecting or removing redirects - parser.add_argument('--inspectRedirects', type=bool, default=True, + parser.add_argument('--disableRemovingRedirects', type=bool, default=False, help='Enable/disable inspecting or removing redirects.') # Enable/disable proxy forward headers @@ -53,7 +53,27 @@ def parse_arguments(): help='Enable/disable proxy forward headers.') args = parser.parse_args() + + # Check the value of --ontoVersion and prompt for additional arguments if needed + if args.ontoVersion == 'timestampArchived': + args.timestamp = input('Please provide the timestamp (e.g., YYYY-MM-DD): ') + elif args.ontoVersion == 'dependencyManifest': + args.manifest = input('Please provide the manifest file path: ') + # Accessing the arguments + logger.info(f"Selected Ontology Version: {args.ontoVersion}") + if hasattr(args, 'timestamp'): + logger.info(f"Timestamp: {args.timestamp}") + timestamp = args.timestamp + else: + timestamp = None + + if hasattr(args, 'manifest'): + logger.info(f"Manifest File Path: {args.manifest}") + manifest = args.manifest + else: + manifest = None + ontoFormat = { 'format': args.ontoFormat, 'precedence': args.ontoPrecedence, @@ -62,11 +82,11 @@ def parse_arguments(): logger.info(f'Ontology Format: {ontoFormat}') logger.info(f'Ontology Version: {args.ontoVersion}') - logger.info(f'Only Ontologies Mode: {args.onlyOntologies}') - logger.info(f'HTTPS Interception: {args.httpsIntercept}') - logger.info(f'Inspect Redirects: {args.inspectRedirects}') + logger.info(f'Only Ontologies Mode: {args.restrictedAccess}') + logger.info(f'HTTPS Interception: {args.httpsInterception}') + logger.info(f'Inspect Redirects: {args.disableRemovingRedirects}') logger.info(f'Forward Headers: {args.forwardHeaders}') - return ontoFormat, args.ontoVersion, args.onlyOntologies, args.httpsIntercept, args.inspectRedirects, args.forwardHeaders + return ontoFormat, args.ontoVersion, args.restrictedAccess, args.httpsInterception, args.disableRemovingRedirects, args.forwardHeaders, timestamp, manifest def get_mime_type(format='turtle'): @@ -83,53 +103,48 @@ def get_mime_type(format='turtle'): def set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion): - logger.info(f'Setting headers based on ontoFormat: {ontoFormat}') + logger.info(f'Setting headers based on ontoFormat: {ontoFormat} and ontoVersion: {ontoVersion}') + # if ontoVersion is original and patchAcceptUpstream is False nothing to do here + if ontoVersion == 'original' and not ontoFormat['patchAcceptUpstream']: + return + # Determine the correct MIME type for the format mime_type = get_mime_type(ontoFormat['format']) - logger.info(f'Requested mimetype: {mime_type}') - - logger.info(f'Wrapper isconnect: {wrapped_request.is_connect_request()}') + logger.info(f'Requested mimetype by proxy: {mime_type}') + # Define conditions for modifying the accept header request_accept_header = wrapped_request.get_request_accept_header() - - # Check the precedence and update the 'Accept' header if necessary - # Redefine the condition - if ontoFormat['precedence'] in ['always'] or \ - (ontoFormat['precedence'] == 'default' 
and request_accept_header == '*/*') or \ - request_accept_header == '*/*': - # Needed to make sure the accept header is define + logger.info(f'Accept header by request: {request_accept_header}') + req_headers_with_priority = parse_accept_header_with_priority(request_accept_header) + req_headers = [x[0] for x in req_headers_with_priority] + if not req_headers and ontoFormat['precedence'] in ['default', ['enforcedPriority']]: wrapped_request.set_request_accept_header(mime_type) - - # Check if patchAcceptUpstream is true and ontoVersion is 'original' - if ontoFormat['patchAcceptUpstream'] and ontoVersion == 'original': + elif len(req_headers) == 1 and req_headers[0] == '*/*' and ontoFormat['precedence'] in ['default', 'enforcedPriority']: + wrapped_request.set_request_accept_header(mime_type) + elif len(req_headers) > 1 and mime_type in req_headers and ontoFormat['precedence'] == 'enforcedPriority': wrapped_request.set_request_accept_header(mime_type) + elif ontoFormat['precedence'] == 'always': + wrapped_request.set_request_accept_header(mime_type) + +def select_highest_priority_mime_from_archivo(mime_list): + # Sort the MIME types by their priority in descending order + sorted_mime_list = sorted(mime_list, key=lambda x: x[1], reverse=True) + # Track the highest priority value + highest_priority = sorted_mime_list[0][1] -# def failover_mode(request): -# headers = get_headers(request) -# logger.info('Failover mode') - -# ontology, _, _ = get_ontology_from_request(request) -# try: -# response = requests.get(url=ontology, headers=headers, timeout=5) -# if response.history: -# logger.debug("Request was redirected") -# for resp in response.history: -# logger.debug(f"{resp.status_code}, {resp.url}") -# logger.debug(f"Final destination: {response.status_code}, {response.url}") -# else: -# logger.debug("Request was not redirected") -# content_type = response.headers.get('Content-Type') -# logger.debug(content_type) -# if response.status_code in passthrough_status_codes_http: -# return response -# else: -# logging.info(f'Status code: {response.status_code}') -# return fetch_from_dbpedia_archivo_api(ontology, headers) -# except (SSLError, Timeout, ConnectionError, RequestException) as e: -# return fetch_from_dbpedia_archivo_api(ontology, headers) + # Filter MIME types that match the highest priority + highest_priority_mimes = [mime for mime, priority in sorted_mime_list if priority == highest_priority] + + # Check if any of the highest priority MIME types are in the archivo list + for mime in highest_priority_mimes: + if mime in archivo_mimetypes: + return mime + + # If none of the preferred MIME types are present, return nothing + return None def map_mime_to_format(mime_type): @@ -144,11 +159,35 @@ def map_mime_to_format(mime_type): return mime_to_format.get(mime_type, None) -def get_parameters_from_headers(headers): +def parse_accept_header_with_priority(accept_header): + logger.info('Parse accept header') + # Parse the Accept header to extract MIME types and their priority (q values) + parsed = parse_accept_header(accept_header) + + # Create a list of tuples with MIME types and their corresponding q values + mime_types_with_priority = [(item[0], item[1]) for item in parsed] + logger.info(f'Accept headers with priority: {mime_types_with_priority}') + + return mime_types_with_priority + + +def get_format_from_accept_header(headers): + if not headers: + return None + # Map MIME types to formats - mime_type = headers.get('Accept', None) - format = map_mime_to_format(mime_type) + accept_header = 
headers.get('Accept', None) + logger.info(f'Accept header: {accept_header}') + if not accept_header: + return None + + accept_header_with_priority = parse_accept_header_with_priority(accept_header) + + selected_mimetype = select_highest_priority_mime_from_archivo(accept_header_with_priority) - version = headers.get('Version', None) - versionMatching = headers.get('VersionMatching', None) - return format, version, versionMatching \ No newline at end of file + if not selected_mimetype: + logger.info(f'The requested mimetype is not supported by DBpedia Archivo') + return None + + format = map_mime_to_format(selected_mimetype) + return format \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 5eb6572..561df6c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -171,6 +171,75 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "markupsafe" +version = "2.1.5" +description = "Safely add untrusted strings to HTML/XML markup." +optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = 
"sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = 
"sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, +] + [[package]] name = "packaging" version = "24.0" @@ -325,7 +394,24 @@ h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] +[[package]] +name = "werkzeug" +version = "3.0.4" +description = "The comprehensive 
WSGI web application library." +optional = false +python-versions = ">=3.8" +files = [ + {file = "werkzeug-3.0.4-py3-none-any.whl", hash = "sha256:02c9eb92b7d6c06f31a782811505d2157837cea66aaede3e217c7c27c039476c"}, + {file = "werkzeug-3.0.4.tar.gz", hash = "sha256:34f2371506b250df4d4f84bfe7b0921e4762525762bbd936614909fe25cd7306"}, +] + +[package.dependencies] +MarkupSafe = ">=2.1.1" + +[package.extras] +watchdog = ["watchdog (>=2.3)"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "202968486827cf46664fd533592fac8667cb3bc2b20ed820d397b7f13243acf2" +content-hash = "aa072190e1a5c335c379c9f3ab09b14dfcf718050b38b08441ba2a91ffefd935" diff --git a/pyproject.toml b/pyproject.toml index 9a287ab..0232beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ pytest = "^8.2.1" requests = "^2.32.3" proxy-py = "^2.4.4" rdflib = "^7.0.0" +werkzeug = "^3.0.4" [build-system] diff --git a/tests/test_unit.py b/tests/test_unit.py index 578f618..9cd7856 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -12,11 +12,11 @@ from ontologytimemachine.utils.utils import ( parse_arguments, map_mime_to_format, - get_parameters_from_headers + get_format_from_accept_header ) from ontologytimemachine.utils.proxy_logic import ( - fetch_from_dbpedia_archivo_api + fetch_latest_archived ) class TestUtils(unittest.TestCase): @@ -28,9 +28,9 @@ def test_parse_arguments(self, mock_parse_args): ontoPrecedence='enforcedPriority', patchAcceptUpstream=False, ontoVersion='originalFailoverLive', - onlyOntologies=True, - httpsIntercept=False, - inspectRedirects=True, + restrictedAccess=True, + httpsInterception=False, + disableRemovingRedirects=True, forwardHeaders=True ) @@ -50,9 +50,9 @@ def test_parse_arguments(self, mock_parse_args): ontoPrecedence='default', patchAcceptUpstream=True, ontoVersion='latestArchive', - onlyOntologies=False, - httpsIntercept=True, - inspectRedirects=False, + restrictedAccess=False, + httpsInterception=True, + disableRemovingRedirects=False, forwardHeaders=False ) @@ -69,7 +69,7 @@ def test_parse_arguments(self, mock_parse_args): @patch('requests.get') - def test_fetch_from_dbpedia_archivo_api(self, mock_get): + def test_fetch_latest_archived(self, mock_get): mock_response = Mock() mock_response.status_code = 200 mock_get.return_value = mock_response @@ -77,11 +77,11 @@ def test_fetch_from_dbpedia_archivo_api(self, mock_get): ontology = 'http://dbpedia.org/ontology/Person' headers = {'Accept': 'text/turtle'} - response = fetch_from_dbpedia_archivo_api(ontology, headers) + response = fetch_latest_archived(ontology, headers) self.assertEqual(response.status_code, 200) mock_get.side_effect = requests.exceptions.RequestException - response = fetch_from_dbpedia_archivo_api(ontology, headers) + response = fetch_latest_archived(ontology, headers) self.assertEqual(response.status_code, 404) def test_map_mime_to_format(self): @@ -90,27 +90,19 @@ def test_map_mime_to_format(self): self.assertEqual(map_mime_to_format('application/n-triples'), 'nt') self.assertIsNone(map_mime_to_format('unknown/mime')) - def test_get_parameters_from_headers(self): - headers = { - 'Accept': 'application/rdf+xml', - 'Version': '1.0', - 'VersionMatching': 'exact' - } - format, version, versionMatching = get_parameters_from_headers(headers) - self.assertEqual(format, 'owl') - self.assertEqual(version, '1.0') - self.assertEqual(versionMatching, 'exact') + def test_get_format_from_accept_header(self): + headers = {'Accept': 'application/json'} + format = 
get_format_from_accept_header(headers) + self.assertEqual(format, None) - headers = { - 'Accept': 'unknown/mime', - 'Version': '2.0', - 'VersionMatching': 'compatible' - } - format, version, versionMatching = get_parameters_from_headers(headers) - self.assertIsNone(format) - self.assertEqual(version, '2.0') - self.assertEqual(versionMatching, 'compatible') + headers = {} + format = get_format_from_accept_header(headers) + self.assertIsNone(format, None) + + headers = {'Accept': 'text/turtle'} + format = get_format_from_accept_header(headers) + self.assertEqual(format, 'ttl') class TestMockResponses(unittest.TestCase): From 1bc4e897742d00c029f102b875879bb1c3b394bf Mon Sep 17 00:00:00 2001 From: JJ-Author Date: Mon, 9 Sep 2024 20:48:16 +0200 Subject: [PATCH 09/35] fixed Dockerfile to support cmd parameters easily --- Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index bc75693..21f939a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,6 +21,4 @@ RUN pip install poetry==$POETRY_VERSION RUN poetry config virtualenvs.create false RUN poetry install --no-dev && rm pyproject.toml - -CMD python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname 0.0.0.0 --port $PORT --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin - +ENTRYPOINT ["python3", "ontologytimemachine/custom_proxy.py"] \ No newline at end of file From 56c4d7a2680022e8671d39f0f7baf32ae7e78500 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Mon, 9 Sep 2024 21:50:30 +0200 Subject: [PATCH 10/35] only add the parameters for the certificate parameters if https interception is enabled --- ontologytimemachine/custom_proxy.py | 19 ++++----- poetry.lock | 62 ++++++++++++++++------------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 0e74b4e..8baf446 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -16,7 +16,7 @@ IP = '0.0.0.0' PORT = '8899' -config = None +config = ({'format': 'turtle', 'precedence': 'enforcedPriority', 'patchAcceptUpstream': False}, 'originalFailoverLiveLatest', False, 'all', False, True, None, None) logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) @@ -27,7 +27,7 @@ def __init__(self, *args, **kwargs): (self.ontoFormat, self.ontoVersion, self.restrictedAccess, self.httpsInterception, self.disableRemovingRedirects, self.forward_headers, self.timestamp, self.manifest) = config - logger.info(config) + logger.info() def before_upstream_connection(self, request: HttpParser): logger.info('Before upstream connection hook') @@ -98,18 +98,19 @@ def queue_response(self, response): sys.argv = [sys.argv[0]] # TODO: fix this - sys.argv += [ - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', - ] + # check it https interception is enabled + if config[3] != 'none': + sys.argv += [ + '--ca-key-file', 'ca-key.pem', + '--ca-cert-file', 'ca-cert.pem', + '--ca-signing-key-file', 'ca-signing-key.pem', + ] + sys.argv += [ '--hostname', IP, '--port', PORT, '--plugins', __name__ + '.OntologyTimeMachinePlugin' ] - print(sys.argv) - logger.info("Starting OntologyTimeMachineProxy server...") proxy.main() \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 561df6c..6b8cef0 100644 --- a/poetry.lock +++
b/poetry.lock @@ -2,13 +2,13 @@ [[package]] name = "certifi" -version = "2024.6.2" +version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.6.2-py3-none-any.whl", hash = "sha256:ddc6c8ce995e6987e7faf5e3f1b02b302836a0e5d98ece18392cb1a36c72ad56"}, - {file = "certifi-2024.6.2.tar.gz", hash = "sha256:3cd43f1c6fa7dedc5899d69d3ad0398fd018ad1a17fba83ddaf78aa46c747516"}, + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, ] [[package]] @@ -123,13 +123,13 @@ files = [ [[package]] name = "exceptiongroup" -version = "1.2.1" +version = "1.2.2" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" files = [ - {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, - {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, ] [package.extras] @@ -137,13 +137,13 @@ test = ["pytest (>=6)"] [[package]] name = "idna" -version = "3.7" +version = "3.8" description = "Internationalized Domain Names in Applications (IDNA)" optional = false -python-versions = ">=3.5" +python-versions = ">=3.6" files = [ - {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"}, - {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, + {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, + {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, ] [[package]] @@ -242,13 +242,13 @@ files = [ [[package]] name = "packaging" -version = "24.0" +version = "24.1" description = "Core utilities for Python packages" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, - {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] [[package]] @@ -268,24 +268,30 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "proxy-py" -version = "2.4.4" -description = "⚡ Fast • 🪶 Lightweight • 0️⃣ Dependency • 🔌 Pluggable • 😈 TLS interception • 🔒 DNS-over-HTTPS • 🔥 Poor Mans VPN • ⏪ Reverse & ⏩ Forward • 👮🏿 Proxy Server framework • 🌐 Web Server framework • ➵ ➶ ➷ ➠ PubSub framework • 👷 Work acceptor & executor framework." 
+version = "2.4.7" +description = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework." optional = false python-versions = ">=3.6" files = [ - {file = "proxy.py-2.4.4-py3-none-any.whl", hash = "sha256:8d02fa2f1f94ad5bee96962c7b02badb9b4075d1a99d31700577ea15713ba4d3"}, - {file = "proxy_py-2.4.4.tar.gz", hash = "sha256:216581f70ad673f4ecb5f6b27f52491aaf1c056829f4a670f5ea3b5a340f4272"}, + {file = "proxy.py-2.4.7-py3-none-any.whl", hash = "sha256:83ddfda5479403434eace531c2bdef41fd9091df473a4051cd9df1564de056a9"}, + {file = "proxy_py-2.4.7.tar.gz", hash = "sha256:2e20ad717025cdee92d528be1321b7af8743d941e56de2ae6f390c6dc67aaad1"}, ] +[package.extras] +metrics = ["prometheus-client (==0.17.1)", "prometheus-client (==0.20.0)"] +release = ["setuptools-scm (==6.3.2)", "twine (==3.8.0)"] +testing = ["autopep8 (==1.6.0)", "coverage (==6.2)", "coverage (==7.4.4)", "flake8 (==4.0.1)", "h2 (==4.1.0)", "hpack (==4.0.0)", "httpx (==0.22.0)", "httpx (==0.27.0)", "hyperframe (==6.0.1)", "mccabe (==0.6.1)", "mypy (==0.971)", "pre-commit (==2.16.0)", "py-spy (==0.3.12)", "pylint (==2.13.7)", "pylint (==3.1.0)", "pytest (==7.0.1)", "pytest (==8.1.1)", "pytest-asyncio (==0.16.0)", "pytest-asyncio (==0.21.1)", "pytest-cov (==3.0.0)", "pytest-cov (==5.0.0)", "pytest-mock (==3.14.0)", "pytest-mock (==3.6.1)", "pytest-xdist (==2.5.0)", "pytest-xdist (==3.5.0)", "python-coveralls (==2.9.3)", "rope (==1.1.1)", "tox (==3.28.0)", "tox (==4.14.2)", "types-requests (==2.28.11.5)", "types-setuptools (==64.0.1)", "wheel (==0.37.1)"] +tunnel = ["cryptography (==36.0.2)", "cryptography (==39.0.1)", "paramiko (==2.11.0)", "paramiko (==3.4.0)", "types-paramiko (==2.11.3)", "types-paramiko (==3.4.0.20240311)"] + [[package]] name = "pyparsing" -version = "3.1.2" +version = "3.1.4" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" files = [ - {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, - {file = "pyparsing-3.1.2.tar.gz", hash = "sha256:a1bac0ce561155ecc3ed78ca94d3c9378656ad4c94c1270de543f621420f94ad"}, + {file = "pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c"}, + {file = "pyparsing-3.1.4.tar.gz", hash = "sha256:f86ec8d1a83f11977c9a6ea7598e8c27fc5cddfa5b07ea2241edbbde1d7bc032"}, ] [package.extras] @@ -293,13 +299,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "8.2.2" +version = "8.3.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, - {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, + {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, + {file = "pytest-8.3.2.tar.gz", hash = 
"sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, ] [package.dependencies] @@ -307,7 +313,7 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.5,<2.0" +pluggy = ">=1.5,<2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] @@ -379,13 +385,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.1" +version = "2.2.2" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.2.1-py3-none-any.whl", hash = "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d"}, - {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"}, + {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, + {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, ] [package.extras] From 700022ed169cca78ef4b272db62b288b568ed54a Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 15:17:09 +0200 Subject: [PATCH 11/35] fix log --- ontologytimemachine/custom_proxy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 8baf446..bed6db2 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -23,11 +23,11 @@ class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): + logger.info('Init') super().__init__(*args, **kwargs) (self.ontoFormat, self.ontoVersion, self.restrictedAccess, self.httpsInterception, self.disableRemovingRedirects, self.forward_headers, self.timestamp, self.manifest) = config - logger.info() def before_upstream_connection(self, request: HttpParser): logger.info('Before upstream connection hook') @@ -96,7 +96,7 @@ def queue_response(self, response): config = parse_arguments() - sys.argv = [sys.argv[0]] # TODO: fix this + sys.argv = [sys.argv[0]] # check it https interception is enabled if config[3] != 'none': From 9966b5e0a29d3a5427f8dd067e87718b6322cccc Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 15:41:34 +0200 Subject: [PATCH 12/35] adding -v to run the tests --- tests/test_integration.py | 83 +++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index 2236854..43f04a8 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -37,77 +37,76 @@ def start_proxy_server(): process.wait() -# def test_babelnet(): -# iri = 'http://babelnet.org/rdf/' -# generic_test(iri, 'text/turtle') - +def test_1_babelnet(): + iri = 'http://babelnet.org/rdf/' + generic_test(iri, 'text/turtle') -# def test_bag_basisregistraties(): -# iri = 'http://bag.basisregistraties.overheid.nl/def/bag' -# generic_test(iri, 'text/turtle') +def test_2_bag_basisregistraties(): + iri = 'http://bag.basisregistraties.overheid.nl/def/bag' + generic_test(iri, 'text/turtle') -# def test_bblfish(): -# iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' -# generic_test(iri, 'text/turtle') +def test_3_bblfish(): + iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' + generic_test(iri, 'text/turtle') -# def 
test_brk_basisregistraties(): -# iri = 'http://brk.basisregistraties.overheid.nl/def/brk' -# generic_test(iri, 'text/turtle') +def test_4_brk_basisregistraties(): + iri = 'http://brk.basisregistraties.overheid.nl/def/brk' + generic_test(iri, 'text/turtle') -# def test_brt_basisregistraties(): -# iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' -# generic_test(iri, 'text/turtle') +def test_5_brt_basisregistraties(): + iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' + generic_test(iri, 'text/turtle') -# def test_brt_basisregistraties_begrippenkader(): -# iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' -# generic_test(iri, 'text/turtle') +def test_6_brt_basisregistraties_begrippenkader(): + iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' + generic_test(iri, 'text/turtle') -# def test_buzzword(): -# iri = 'http://buzzword.org.uk/rdf/personal-link-types#' -# generic_test(iri, 'text/turtle') +def test_7_buzzword(): + iri = 'http://buzzword.org.uk/rdf/personal-link-types#' + generic_test(iri, 'text/turtle') -# def test_catalogus_professorum(): -# iri = 'http://catalogus-professorum.org/cpm/2/' -# generic_test(iri, 'text/turtle') +def test_8_catalogus_professorum(): + iri = 'http://catalogus-professorum.org/cpm/2/' + generic_test(iri, 'text/turtle') -# def test_data_gov(): -# iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' -# generic_test(iri, 'text/turtle') +def test_9_data_gov(): + iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' + generic_test(iri, 'text/turtle') -# def test_data_bigdatagrapes(): -# iri = 'http://data.bigdatagrapes.eu/resource/ontology/' -# generic_test(iri, 'text/turtle') +def test_10_data_bigdatagrapes(): + iri = 'http://data.bigdatagrapes.eu/resource/ontology/' + generic_test(iri, 'text/turtle') -# def test_data_europa_esco(): -# iri = 'http://data.europa.eu/esco/flow' -# generic_test(iri, 'text/turtle') +def test_11_data_europa_esco(): + iri = 'http://data.europa.eu/esco/flow' + generic_test(iri, 'text/turtle') -# def test_data_globalchange(): -# iri = 'http://data.globalchange.gov/gcis.owl' -# generic_test(iri, 'text/turtle') +def test_12_data_globalchange(): + iri = 'http://data.globalchange.gov/gcis.owl' + generic_test(iri, 'text/turtle') -def test_data_ontotext(): +def test_13_data_ontotext(): iri = 'http://data.ontotext.com/resource/leak/' generic_test(iri, 'text/turtle') -# def test_data_ordnancesurvey_50kGazetteer(): -# iri = 'http://dbpedia.org/ontology/Person' -# generic_test(iri, 'text/turtle') +def test_14_data_ordnancesurvey_50kGazetteer(): + iri = 'http://dbpedia.org/ontology/Person' + generic_test(iri, 'text/turtle') -def test_linked_web_apis(): +def test_15_linked_web_apis(): iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' generic_test(iri, 'text/turtle') From dbb96ca64b805dad22acf47e1095829fe9c96223 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 15:45:08 +0200 Subject: [PATCH 13/35] adding -v to run the tests --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 6fbcf65..82a5be1 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -31,4 +31,4 @@ jobs: echo "$CA_SIGNING_KEY" > ca-signing-key.pem - name: Test with pytest run: | - poetry run pytest + poetry run pytest -v From a5f872fe7454a1c95d0ce2a0766a5d8999d52804 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 
17:21:42 +0200 Subject: [PATCH 14/35] fix testcases --- .github/workflows/pytest.yml | 3 ++ tests/test_integration.py | 54 ++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 82a5be1..874fe8d 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -29,6 +29,9 @@ jobs: echo "$CA_CERT" > ca-cert.pem echo "$CA_KEY" > ca-key.pem echo "$CA_SIGNING_KEY" > ca-signing-key.pem + - name: Start the proxy + run: | + python3 ontologytimemachine/custom_proxy.py & - name: Test with pytest run: | poetry run pytest -v diff --git a/tests/test_integration.py b/tests/test_integration.py index 43f04a8..e7a2ef6 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -16,25 +16,35 @@ CA_CERT_PATH = "ca-cert.pem" -@pytest.fixture(scope="module", autouse=True) -def start_proxy_server(): - # Start the proxy server in a subprocess - process = subprocess.Popen( - [ - 'python3', 'ontologytimemachine/custom_proxy.py', - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) +# @pytest.fixture(scope="module", autouse=True) +# def start_proxy_server(): +# # Start the proxy server in a subprocess +# process = subprocess.Popen( +# [ +# 'python3', 'ontologytimemachine/custom_proxy.py', +# ], +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE +# ) - # Wait a bit to ensure the server starts - time.sleep(5) +# # Wait a bit to ensure the server starts +# time.sleep(5) - yield - "http://0.0.0.0:8899" - # Terminate the proxy server after tests - process.terminate() - process.wait() +# yield +# "http://0.0.0.0:8899" +# # Terminate the proxy server after tests +# process.terminate() +# process.wait() + + +def test_12_data_globalchange(): + iri = 'http://data.globalchange.gov/gcis.owl' + generic_test(iri, 'text/turtle') + + +def test_13_data_ontotext(): + iri = 'http://data.ontotext.com/resource/leak/' + generic_test(iri, 'text/turtle') def test_1_babelnet(): @@ -91,16 +101,6 @@ def test_11_data_europa_esco(): generic_test(iri, 'text/turtle') -def test_12_data_globalchange(): - iri = 'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') - - -def test_13_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') - - def test_14_data_ordnancesurvey_50kGazetteer(): iri = 'http://dbpedia.org/ontology/Person' generic_test(iri, 'text/turtle') From 37112253123a5148f3192598c55559102ffdfef1 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 17:24:39 +0200 Subject: [PATCH 15/35] start poetry from workflow --- .github/workflows/pytest.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 874fe8d..53c9b9c 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -29,6 +29,9 @@ jobs: echo "$CA_CERT" > ca-cert.pem echo "$CA_KEY" > ca-key.pem echo "$CA_SIGNING_KEY" > ca-signing-key.pem + - name: Start poetry venv + run: | + poetry shell - name: Start the proxy run: | python3 ontologytimemachine/custom_proxy.py & From f6814ab6725b8ac9cee9a16baf51f14bc1095f6a Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 10 Sep 2024 17:28:59 +0200 Subject: [PATCH 16/35] fixes --- .github/workflows/pytest.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index 53c9b9c..34b322a 100644 --- a/.github/workflows/pytest.yml +++ 
b/.github/workflows/pytest.yml @@ -29,12 +29,9 @@ jobs: echo "$CA_CERT" > ca-cert.pem echo "$CA_KEY" > ca-key.pem echo "$CA_SIGNING_KEY" > ca-signing-key.pem - - name: Start poetry venv - run: | - poetry shell - name: Start the proxy run: | - python3 ontologytimemachine/custom_proxy.py & + poetry run python ontologytimemachine/custom_proxy.py & - name: Test with pytest run: | poetry run pytest -v From 415fceb2d4a0108850fbc7b42c79dc0239c12c6f Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 27 Sep 2024 14:25:59 +0200 Subject: [PATCH 17/35] fixing some comments --- ontologytimemachine/custom_proxy.py | 10 ++--- ontologytimemachine/proxy_wrapper.py | 46 +++++++++++----------- ontologytimemachine/utils/proxy_logic.py | 49 ++++++++++++++++-------- ontologytimemachine/utils/utils.py | 2 +- poetry.lock | 27 +++++++------ 5 files changed, 76 insertions(+), 58 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index bed6db2..bb86de8 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -4,8 +4,8 @@ from ontologytimemachine.utils.utils import parse_arguments from ontologytimemachine.utils.mock_responses import mock_response_403 from ontologytimemachine.proxy_wrapper import HttpRequestWrapper -from ontologytimemachine.utils.proxy_logic import proxy_logic, is_ontology_request_only_ontology -from ontologytimemachine.utils.proxy_logic import is_archivo_ontology_request +from ontologytimemachine.utils.proxy_logic import proxy_logic, is_archivo_ontology_request +from ontologytimemachine.utils.proxy_logic import do_deny_request_due_non_archivo_ontology_uri from ontologytimemachine.utils.proxy_logic import if_intercept_host from http.client import responses import proxy @@ -46,7 +46,7 @@ def before_upstream_connection(self, request: HttpParser): return None # If only ontology mode, return None in all other cases - if is_ontology_request_only_ontology(wrapped_request, self.restrictedAccess): + if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, self.restrictedAccess): logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') self.queue_response(mock_response_403) return None @@ -56,6 +56,7 @@ def before_upstream_connection(self, request: HttpParser): response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) self.queue_response(response) return None + return request def handle_client_request(self, request: HttpParser): @@ -66,8 +67,7 @@ def handle_client_request(self, request: HttpParser): if wrapped_request.is_connect_request(): return request - is_ontology_request = is_archivo_ontology_request(wrapped_request) - if not is_ontology_request: + if not do_deny_request_due_non_archivo_ontology_uri(wrapped_request): logger.info('The requested IRI is not part of DBpedia Archivo') return request diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index fe5d3db..69779b7 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -1,14 +1,15 @@ from abc import ABC, abstractmethod from proxy.http.parser import HttpParser import logging +from typing import Tuple, Dict, Any - +# Configure logger logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) class AbstractRequestWrapper(ABC): - def __init__(self, request): + def __init__(self, request: 
Any) -> None: self.request = request @abstractmethod @@ -28,74 +29,71 @@ def is_https_request(self) -> bool: pass @abstractmethod - def get_request(self): + def get_request(self) -> Any: pass @abstractmethod - def get_request_headers(self): + def get_request_headers(self) -> Dict[str, str]: pass @abstractmethod - def get_request_accept_header(self): + def get_request_accept_header(self) -> str: pass @abstractmethod - def set_request_accept_header(self, mime_type): + def set_request_accept_header(self, mime_type: str) -> None: pass @abstractmethod - def get_ontology_from_request(self): + def get_ontology_iri_host_path_from_request(self) -> Tuple[str, str, str]: pass class HttpRequestWrapper(AbstractRequestWrapper): - def __init__(self, request: HttpParser): + def __init__(self, request: HttpParser) -> None: super().__init__(request) def is_get_request(self) -> bool: return self.request.method == b'GET' - def is_connect_request(self): + def is_connect_request(self) -> bool: return self.request.method == b'CONNECT' - def is_head_request(self): + def is_head_request(self) -> bool: return self.request.method == b'HEAD' - def is_https_request(self): + def is_https_request(self) -> bool: return self.request.method == b'CONNECT' or self.request.headers.get(b'Host', b'').startswith(b'https') - def get_request(self): + def get_request(self) -> HttpParser: return self.request - def get_request_headers(self): - headers = {} + def get_request_headers(self) -> Dict[str, str]: + headers: Dict[str, str] = {} for k, v in self.request.headers.items(): headers[v[0].decode('utf-8')] = v[1].decode('utf-8') return headers - def get_request_accept_header(self): + def get_request_accept_header(self) -> str: logger.info('Wrapper - get_request_accept_header') return self.request.headers[b'accept'][1].decode('utf-8') - def set_request_accept_header(self, mime_type): + def set_request_accept_header(self, mime_type: str) -> None: self.request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) logger.info(f'Accept header set to: {self.request.headers[b"accept"][1]}') - def get_ontology_from_request(self): + def get_ontology_iri_host_path_from_request(self) -> Tuple[str, str, str]: logger.info('Get ontology from request') - print(f'Request protocol: {self.request.protocol}') - print(f'Request host: {self.request.host}') - print(f'Request _url: {self.request._url}') - print(f'Request path: {self.request.path}') - if (self.request.method == b'GET' or self.request.method == b'HEAD') and not self.request.host: + if (self.request.method in {b'GET', b'HEAD'}) and not self.request.host: for k, v in self.request.headers.items(): if v[0].decode('utf-8') == 'Host': host = v[1].decode('utf-8') path = self.request.path.decode('utf-8') - ontology = 'https://' + host + path + ontology = f'https://{host}{path}' else: host = self.request.host.decode('utf-8') path = self.request.path.decode('utf-8') ontology = str(self.request._url) + logger.info(f'Ontology: {ontology}') - return ontology, host, path \ No newline at end of file + return ontology, host, path diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 0837dba..9e0b4c8 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -2,42 +2,59 @@ import requests import rdflib from urllib.parse import urlparse - from ontologytimemachine.utils.utils import set_onto_format_headers, get_format_from_accept_header from ontologytimemachine.utils.utils import 
parse_accept_header_with_priority from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes from ontologytimemachine.utils.mock_responses import mock_response_500 from ontologytimemachine.utils.mock_responses import mock_response_404 +from typing import Set, Tuple logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) +ARCHIVO_PARSED_URLS: Set[Tuple[str, str]] = set() + + def if_intercept_host(https_intercept): if https_intercept in ['all']: return True return False -def is_ontology_request_only_ontology(wrapped_request, only_ontologies): - is_archivo_ontology = is_archivo_ontology_request(wrapped_request) - if only_ontologies and not is_archivo_ontology: - return True +def do_deny_request_due_non_archivo_ontology_uri (wrapped_request, only_ontologies): + if only_ontologies: + is_archivo_ontology = is_archivo_ontology_request(wrapped_request) + if not is_archivo_ontology: + return True return False -def is_archivo_ontology_request(wrapped_request): - logger.info('Chekc if the requested ontology is in archivo') - with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: - urls = [line.strip() for line in file] - parsed_urls = [(urlparse(url).netloc, urlparse(url).path) for url in urls] +def load_archivo_urls() -> None: + """Load the archivo URLs into the global variable if not already loaded.""" + global ARCHIVO_PARSED_URLS + if not ARCHIVO_PARSED_URLS: # Load only if the set is empty + logger.info('Loading archivo ontologies from file') + with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: + ARCHIVO_PARSED_URLS = { + (urlparse(line.strip()).netloc, urlparse(line.strip()).path) for line in file + } - _, request_host, request_path = wrapped_request.get_ontology_from_request() - for host, path in parsed_urls: - if request_host == host and request_path.startswith(path): - return True - return False + +def is_archivo_ontology_request(wrapped_request) -> bool: + """Check if the requested ontology is in the archivo.""" + logger.info('Check if the requested ontology is in archivo') + + # Ensure the archivo URLs are loaded + load_archivo_urls() + + # Extract the request's host and path + request_host = wrapped_request.get_request().host.decode('utf-8') + request_path = wrapped_request.get_request().path.decode('utf-8') + + # Check if the (host, path) tuple exists in ARCHIVO_PARSED_URLS + return (request_host, request_path) in ARCHIVO_PARSED_URLS def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): @@ -57,7 +74,7 @@ def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirec set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) headers = wrapped_request.get_request_headers() - ontology, _, _ = wrapped_request.get_ontology_from_request() + ontology, _, _ = wrapped_request.get_ontology_iri_host_path_from_request() # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 8c48945..31ad83d 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -41,7 +41,7 @@ def parse_arguments(): help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') # Enable HTTPS interception for specific domains - parser.add_argument('--httpsInterception', type=str, choices=['none', 'all'], + 
parser.add_argument('--httpsInterception', type=str, choices=['none', 'all', 'block'], default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') # Enable/disable inspecting or removing redirects diff --git a/poetry.lock b/poetry.lock index 6b8cef0..e0689c0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -137,15 +137,18 @@ test = ["pytest (>=6)"] [[package]] name = "idna" -version = "3.8" +version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.6" files = [ - {file = "idna-3.8-py3-none-any.whl", hash = "sha256:050b4e5baadcd44d760cedbd2b8e639f2ff89bbc7a5730fcc662954303377aac"}, - {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, ] +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + [[package]] name = "iniconfig" version = "2.0.0" @@ -268,13 +271,13 @@ testing = ["pytest", "pytest-benchmark"] [[package]] name = "proxy-py" -version = "2.4.7" +version = "2.4.8" description = "\\u26a1 Fast \\u2022 \\U0001fab6 Lightweight \\u2022 \\U0001f51f Dependency \\u2022 \\U0001f50c Pluggable \\u2022 \\U0001f608 TLS interception \\u2022 \\U0001f512 DNS-over-HTTPS \\u2022 \\U0001f525 Poor Mans VPN \\u2022 \\u23ea Reverse & \\u23e9 Forward \\u2022 \\U0001f46e\\U0001f3ff Proxy Server framework \\u2022 \\U0001f310 Web Server framework \\u2022 \\u27b5 \\u27b6 \\u27b7 \\u27a0 PubSub framework \\u2022 \\U0001f477 Work acceptor & executor framework." optional = false python-versions = ">=3.6" files = [ - {file = "proxy.py-2.4.7-py3-none-any.whl", hash = "sha256:83ddfda5479403434eace531c2bdef41fd9091df473a4051cd9df1564de056a9"}, - {file = "proxy_py-2.4.7.tar.gz", hash = "sha256:2e20ad717025cdee92d528be1321b7af8743d941e56de2ae6f390c6dc67aaad1"}, + {file = "proxy.py-2.4.8-py3-none-any.whl", hash = "sha256:316cbed3184c8ddf4f9b3143f7dc449ef1d44a7c5ca1988276a01444f6426e51"}, + {file = "proxy_py-2.4.8.tar.gz", hash = "sha256:77088312aa558c9402af2b88d135a1e261af51f5e38242f1d37867559a0a65cb"}, ] [package.extras] @@ -299,13 +302,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pytest" -version = "8.3.2" +version = "8.3.3" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.3.2-py3-none-any.whl", hash = "sha256:4ba08f9ae7dcf84ded419494d229b48d0903ea6407b030eaec46df5e6a73bba5"}, - {file = "pytest-8.3.2.tar.gz", hash = "sha256:c132345d12ce551242c87269de812483f5bcc87cdbb4722e48487ba194f9fdce"}, + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, ] [package.dependencies] @@ -385,13 +388,13 @@ files = [ [[package]] name = "urllib3" -version = "2.2.2" +version = "2.2.3" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=3.8" files = [ - {file = "urllib3-2.2.2-py3-none-any.whl", hash = "sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472"}, - {file = "urllib3-2.2.2.tar.gz", hash = "sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168"}, + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, ] [package.extras] From 2bf7e6da0635beb229408c9846ab11a03f91aaa9 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 4 Oct 2024 07:23:07 +0200 Subject: [PATCH 18/35] add poetry installation to README --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 65c33ca..f335563 100644 --- a/README.md +++ b/README.md @@ -36,5 +36,14 @@ cp ca-signing-key.pem ~/ontology-time-machine/ca-signing-key.pem - curl -x http://0.0.0.0:8899 -H "Accept: text/turtle" --cacert ca-cert.pem http://ontologi.es/days# +### Install poetry virtual environment +``` +poetry install +``` + +### Activate poetry environment +``` +poetry shell +``` python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname IP --port 8899 --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin --ontoFormat ntriples --ontoVersion originalFailoverLive --ontoPrecedence enforcedPriority \ No newline at end of file From 7b4c919a998fe87b8e9a9b4c1e98c8a9915b50d9 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 4 Oct 2024 07:33:05 +0200 Subject: [PATCH 19/35] rename get_ontology_from_request function --- ontologytimemachine/proxy_wrapper.py | 12 ++++++------ ontologytimemachine/utils/proxy_logic.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index 69779b7..0e897ea 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -45,7 +45,7 @@ def set_request_accept_header(self, mime_type: str) -> None: pass @abstractmethod - def get_ontology_iri_host_path_from_request(self) -> Tuple[str, str, str]: + def get_request_url_host_path(self) -> Tuple[str, str, str]: pass @@ -82,18 +82,18 @@ def set_request_accept_header(self, mime_type: str) -> None: self.request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) logger.info(f'Accept header set to: {self.request.headers[b"accept"][1]}') - def get_ontology_iri_host_path_from_request(self) -> Tuple[str, str, str]: + def get_request_url_host_path(self) -> Tuple[str, str, str]: logger.info('Get ontology from request') if (self.request.method in {b'GET', b'HEAD'}) and not self.request.host: for k, v in self.request.headers.items(): if v[0].decode('utf-8') == 'Host': host = v[1].decode('utf-8') path = self.request.path.decode('utf-8') - ontology = f'https://{host}{path}' + url = f'https://{host}{path}' else: host = self.request.host.decode('utf-8') path = self.request.path.decode('utf-8') - ontology = str(self.request._url) + url = str(self.request._url) - logger.info(f'Ontology: {ontology}') - return ontology, host, path + logger.info(f'Ontology: {url}') + return url, host, path diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 9e0b4c8..93cfb73 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ 
b/ontologytimemachine/utils/proxy_logic.py @@ -74,7 +74,7 @@ def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirec set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) headers = wrapped_request.get_request_headers() - ontology, _, _ = wrapped_request.get_ontology_iri_host_path_from_request() + ontology, _, _ = wrapped_request.get_request_url_host_path() # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through From 2d3ceaa0d1781f4a6f82cba023e70c9401c8aeb9 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Fri, 4 Oct 2024 08:15:09 +0200 Subject: [PATCH 20/35] transform config to dict from tuple --- ontologytimemachine/custom_proxy.py | 49 +++++++++++------------- ontologytimemachine/utils/proxy_logic.py | 27 ++++++++++++- ontologytimemachine/utils/utils.py | 14 ++++++- 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index bb86de8..41562f9 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -16,7 +16,7 @@ IP = '0.0.0.0' PORT = '8899' -config = ({'format': 'turtle', 'precedence': 'enforcedPriority', 'patchAcceptUpstream': False}, 'originalFailoverLiveLatest', False, 'all', False, True, None, None) +config = None logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) @@ -25,9 +25,7 @@ class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): logger.info('Init') super().__init__(*args, **kwargs) - (self.ontoFormat, self.ontoVersion, self.restrictedAccess, - self.httpsInterception, self.disableRemovingRedirects, - self.forward_headers, self.timestamp, self.manifest) = config + self.config = config def before_upstream_connection(self, request: HttpParser): logger.info('Before upstream connection hook') @@ -35,29 +33,22 @@ def before_upstream_connection(self, request: HttpParser): wrapped_request = HttpRequestWrapper(request) if wrapped_request.is_connect_request(): - logger.info(f'HTTPS interception mode: {self.httpsInterception}') + logger.info(f'HTTPS interception mode: {self.config["httpsInterception"]}') # Only intercept if interception is enabled # Move this to the utils - if if_intercept_host(self.httpsInterception): + if if_intercept_host(self.config["httpsInterception"]): logger.info('HTTPS interception is on, forwardig the request') return request else: logger.info('HTTPS interception is turned off') return None - # If only ontology mode, return None in all other cases - if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, self.restrictedAccess): - logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') - self.queue_response(mock_response_403) - return None - - if is_archivo_ontology_request(wrapped_request): - logger.debug('The request is for an ontology') - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) - self.queue_response(response) - return None - - return request + # # If only ontology mode, return None in all other cases + # response = get_response_from_request(wrapped_request, config) + # if response: + # self.queue_response(mock_response_403) + # return None + # return request def handle_client_request(self, request: HttpParser): logger.info('Handle client request hook') 
@@ -65,17 +56,23 @@ def handle_client_request(self, request: HttpParser): wrapped_request = HttpRequestWrapper(request) if wrapped_request.is_connect_request(): - return request + return request - if not do_deny_request_due_non_archivo_ontology_uri(wrapped_request): + if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config["restrictedAccess"]): logger.info('The requested IRI is not part of DBpedia Archivo') - return request - - response = proxy_logic(wrapped_request, self.ontoFormat, self.ontoVersion, self.disableRemovingRedirects, self.timestamp, self.manifest) + return request + + print("proxy logic") + response = proxy_logic(wrapped_request, + config["ontoFormat"], + config["ontoVersion"], + config["disableRemovingRedirects"], + config["timestamp"], + config["manifest"]) self.queue_response(response) return None - + def handle_upstream_chunk(self, chunk: memoryview): return chunk @@ -99,7 +96,7 @@ def queue_response(self, response): sys.argv = [sys.argv[0]] # check it https interception is enabled - if config[3] != 'none': + if config["httpsInterception"] != 'none': sys.argv += [ '--ca-key-file', 'ca-key.pem', '--ca-cert-file', 'ca-cert.pem', diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 93cfb73..b5b1a39 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -6,7 +6,7 @@ from ontologytimemachine.utils.utils import parse_accept_header_with_priority from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes from ontologytimemachine.utils.mock_responses import mock_response_500 -from ontologytimemachine.utils.mock_responses import mock_response_404 +from ontologytimemachine.utils.mock_responses import mock_response_404, mock_response_403 from typing import Set, Tuple @@ -18,13 +18,17 @@ def if_intercept_host(https_intercept): - if https_intercept in ['all']: + print(https_intercept) + if https_intercept in ['none', 'all']: return True + elif https_intercept in ['block']: + return False return False def do_deny_request_due_non_archivo_ontology_uri (wrapped_request, only_ontologies): if only_ontologies: + print(only_ontologies) is_archivo_ontology = is_archivo_ontology_request(wrapped_request) if not is_archivo_ontology: return True @@ -34,6 +38,7 @@ def do_deny_request_due_non_archivo_ontology_uri (wrapped_request, only_ontologi def load_archivo_urls() -> None: """Load the archivo URLs into the global variable if not already loaded.""" global ARCHIVO_PARSED_URLS + print(ARCHIVO_PARSED_URLS) if not ARCHIVO_PARSED_URLS: # Load only if the set is empty logger.info('Loading archivo ontologies from file') with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: @@ -42,6 +47,22 @@ def load_archivo_urls() -> None: } +def get_response_from_request(wrapped_request, config): + if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config["restrictedAccess"]): + logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') + return mock_response_403 + + if is_archivo_ontology_request(wrapped_request): + logger.debug('The request is for an ontology') + response = proxy_logic(wrapped_request, + config["ontoFormat"], + config["ontoVersion"], + config["disableRemovingRedirects"], + config["timestamp"], + config["manifest"]) + return response + + def is_archivo_ontology_request(wrapped_request) -> bool: """Check if the requested ontology is in the archivo.""" logger.info('Check if the requested ontology 
is in archivo') @@ -52,6 +73,8 @@ def is_archivo_ontology_request(wrapped_request) -> bool: # Extract the request's host and path request_host = wrapped_request.get_request().host.decode('utf-8') request_path = wrapped_request.get_request().path.decode('utf-8') + + print((request_host, request_path) in ARCHIVO_PARSED_URLS) # Check if the (host, path) tuple exists in ARCHIVO_PARSED_URLS return (request_host, request_path) in ARCHIVO_PARSED_URLS diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index 31ad83d..d83465f 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -86,7 +86,19 @@ def parse_arguments(): logger.info(f'HTTPS Interception: {args.httpsInterception}') logger.info(f'Inspect Redirects: {args.disableRemovingRedirects}') logger.info(f'Forward Headers: {args.forwardHeaders}') - return ontoFormat, args.ontoVersion, args.restrictedAccess, args.httpsInterception, args.disableRemovingRedirects, args.forwardHeaders, timestamp, manifest + + config = { + "ontoFormat": ontoFormat, + "ontoVersion": args.ontoVersion, + "restrictedAccess": args.restrictedAccess, + "httpsInterception": args.httpsInterception, + "disableRemovingRedirects": args.disableRemovingRedirects, + "forward_headers": args.forwardHeaders, + "timestamp": timestamp, + "manifest": manifest, + } + + return config def get_mime_type(format='turtle'): From 4c98e271eabc617e96b602761de53420bb297efb Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Mon, 7 Oct 2024 00:59:52 +0200 Subject: [PATCH 21/35] Create Config dataclass and some cleanups --- ontologytimemachine/custom_proxy.py | 105 ++++++------ ontologytimemachine/utils/config.py | 101 ++++++++++++ ontologytimemachine/utils/proxy_logic.py | 185 +++++++++++++--------- ontologytimemachine/utils/utils.py | 109 ++----------- tests/oldtest_integration.py | 193 ----------------------- 5 files changed, 277 insertions(+), 416 deletions(-) create mode 100644 ontologytimemachine/utils/config.py delete mode 100644 tests/oldtest_integration.py diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 41562f9..7e8ade5 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -1,77 +1,68 @@ from proxy.http.proxy import HttpProxyBasePlugin from proxy.http.parser import HttpParser from proxy.common.utils import build_http_response -from ontologytimemachine.utils.utils import parse_arguments from ontologytimemachine.utils.mock_responses import mock_response_403 from ontologytimemachine.proxy_wrapper import HttpRequestWrapper -from ontologytimemachine.utils.proxy_logic import proxy_logic, is_archivo_ontology_request -from ontologytimemachine.utils.proxy_logic import do_deny_request_due_non_archivo_ontology_uri +from ontologytimemachine.utils.proxy_logic import get_response_from_request from ontologytimemachine.utils.proxy_logic import if_intercept_host +from ontologytimemachine.utils.config import Config, parse_arguments from http.client import responses import proxy import sys import logging -IP = '0.0.0.0' -PORT = '8899' +IP = "0.0.0.0" +PORT = "8899" config = None -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + class OntologyTimeMachinePlugin(HttpProxyBasePlugin): def __init__(self, *args, **kwargs): - logger.info('Init') + logger.info("Init") 
super().__init__(*args, **kwargs) self.config = config def before_upstream_connection(self, request: HttpParser): - logger.info('Before upstream connection hook') - logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + logger.info("Before upstream connection hook") + logger.info( + f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" + ) wrapped_request = HttpRequestWrapper(request) if wrapped_request.is_connect_request(): - logger.info(f'HTTPS interception mode: {self.config["httpsInterception"]}') + logger.info(f"HTTPS interception mode: {self.config.httpsInterception}") + # Only intercept if interception is enabled - # Move this to the utils - if if_intercept_host(self.config["httpsInterception"]): - logger.info('HTTPS interception is on, forwardig the request') + if if_intercept_host(self.config.httpsInterception): + logger.info("HTTPS interception is on, forwardig the request") return request else: - logger.info('HTTPS interception is turned off') + logger.info("HTTPS interception is blocked") return None # # If only ontology mode, return None in all other cases - # response = get_response_from_request(wrapped_request, config) - # if response: - # self.queue_response(mock_response_403) - # return None - # return request - - def handle_client_request(self, request: HttpParser): - logger.info('Handle client request hook') - logger.info(f'Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}') + logger.info(f"Config: {self.config}") + response = get_response_from_request(wrapped_request, self.config) + if response: + self.queue_response(response) + return None - wrapped_request = HttpRequestWrapper(request) - if wrapped_request.is_connect_request(): - return request - - if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config["restrictedAccess"]): - logger.info('The requested IRI is not part of DBpedia Archivo') - return request + return request - print("proxy logic") - response = proxy_logic(wrapped_request, - config["ontoFormat"], - config["ontoVersion"], - config["disableRemovingRedirects"], - config["timestamp"], - config["manifest"]) - self.queue_response(response) + def handle_client_request(self, request: HttpParser): + logger.info("Handle client request hook") + logger.info( + f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" + ) - return None + return request def handle_upstream_chunk(self, chunk: memoryview): return chunk @@ -79,35 +70,43 @@ def handle_upstream_chunk(self, chunk: memoryview): def queue_response(self, response): self.client.queue( build_http_response( - response.status_code, - reason=bytes(responses[response.status_code], 'utf-8'), + response.status_code, + reason=bytes(responses[response.status_code], "utf-8"), headers={ - b'Content-Type': bytes(response.headers.get('Content-Type'), 'utf-8') - }, - body=response.content + b"Content-Type": bytes( + response.headers.get("Content-Type"), "utf-8" + ) + }, + body=response.content, ) ) -if __name__ == '__main__': +if __name__ == "__main__": config = parse_arguments() sys.argv = [sys.argv[0]] # check it https interception is enabled - if config["httpsInterception"] != 'none': + if config.httpsInterception != "none": sys.argv += [ - '--ca-key-file', 'ca-key.pem', - 
'--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', + "--ca-key-file", + "ca-key.pem", + "--ca-cert-file", + "ca-cert.pem", + "--ca-signing-key-file", + "ca-signing-key.pem", ] sys.argv += [ - '--hostname', IP, - '--port', PORT, - '--plugins', __name__ + '.OntologyTimeMachinePlugin' + "--hostname", + IP, + "--port", + PORT, + "--plugins", + __name__ + ".OntologyTimeMachinePlugin", ] logger.info("Starting OntologyTimeMachineProxy server...") - proxy.main() \ No newline at end of file + proxy.main() diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py new file mode 100644 index 0000000..07c64c1 --- /dev/null +++ b/ontologytimemachine/utils/config.py @@ -0,0 +1,101 @@ +import argparse +from dataclasses import dataclass +from enum import Enum +from typing import Dict, Any + + +class LogLevel(Enum): + DEBUG = 'debug' + INFO = 'info' + WARNING = 'warning' + ERROR = 'error' + + +@dataclass +class Config: + logLevel: LogLevel = LogLevel.INFO + ontoFormat: Dict[str, Any] = None + ontoVersion: str = '' + restrictedAccess: bool = False + httpsInterception: bool = False + disableRemovingRedirects: bool = False + timestamp: str = '' + #manifest: Dict[str, Any] = None + + +def parse_arguments() -> Config: + parser = argparse.ArgumentParser(description='Process ontology format and version.') + + # Defining ontoFormat argument with nested options + parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], + default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') + + parser.add_argument('--ontoPrecedence', type=str, choices=['default', 'enforcedPriority', 'always'], + default='enforcedPriority', help='Precedence of the ontology: default, enforcedPriority, always') + + parser.add_argument('--patchAcceptUpstream', type=bool, default=False, + help='Defines if the Accept Header is patched upstream in original mode.') + + # Defining ontoVersion argument + parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLiveLatest', + 'latestArchived', 'timestampArchived', 'dependencyManifest'], + default='originalFailoverLiveLatest', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') + + # Enable/disable mode to only proxy requests to ontologies + parser.add_argument('--restrictedAccess', type=bool, default=False, + help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') + + # Enable HTTPS interception for specific domains + parser.add_argument('--httpsInterception', type=str, choices=['none', 'all', 'block'], + default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') + + # Enable/disable inspecting or removing redirects + parser.add_argument('--disableRemovingRedirects', type=bool, default=False, + help='Enable/disable inspecting or removing redirects.') + + # Log level + parser.add_argument('--logLevel', type=str, default='info', + help='Level of the logging: debug, info, warning, error.') + + args = parser.parse_args() + + # Check the value of --ontoVersion and prompt for additional arguments if needed + if args.ontoVersion == 'timestampArchived': + args.timestamp = input('Please provide the timestamp (e.g., YYYY-MM-DD): ') + # Commenting manifest related code as it is not supported in the current version + # elif args.ontoVersion == 'dependencyManifest': + # args.manifest = 
input('Please provide the manifest file path: ') + + # Accessing the arguments + if hasattr(args, 'timestamp'): + logger.info(f"Timestamp: {args.timestamp}") + timestamp = args.timestamp + else: + timestamp = None + + # if hasattr(args, 'manifest'): + # logger.info(f"Manifest File Path: {args.manifest}") + # manifest = args.manifest + # else: + # manifest = None + + # Create ontoFormat dictionary + ontoFormat = { + 'format': args.ontoFormat, + 'precedence': args.ontoPrecedence, + 'patchAcceptUpstream': args.patchAcceptUpstream + } + + # Initialize the Config class with parsed arguments + config = Config( + logLevel=args.logLevel, + ontoFormat=ontoFormat, + ontoVersion=args.ontoVersion, + restrictedAccess=args.restrictedAccess, + httpsInterception=args.httpsInterception, + disableRemovingRedirects=args.disableRemovingRedirects, + timestamp=timestamp + #manifest=manifest + ) + + return config diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index b5b1a39..526db4c 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -2,15 +2,23 @@ import requests import rdflib from urllib.parse import urlparse -from ontologytimemachine.utils.utils import set_onto_format_headers, get_format_from_accept_header +from ontologytimemachine.utils.utils import ( + set_onto_format_headers, + get_format_from_accept_header, +) from ontologytimemachine.utils.utils import parse_accept_header_with_priority -from ontologytimemachine.utils.utils import dbpedia_api, passthrough_status_codes +from ontologytimemachine.utils.utils import archivo_api, passthrough_status_codes from ontologytimemachine.utils.mock_responses import mock_response_500 -from ontologytimemachine.utils.mock_responses import mock_response_404, mock_response_403 +from ontologytimemachine.utils.mock_responses import ( + mock_response_404, + mock_response_403, +) from typing import Set, Tuple -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) @@ -19,61 +27,66 @@ def if_intercept_host(https_intercept): print(https_intercept) - if https_intercept in ['none', 'all']: + if https_intercept in ["none", "all"]: return True - elif https_intercept in ['block']: + elif https_intercept in ["block"]: return False return False -def do_deny_request_due_non_archivo_ontology_uri (wrapped_request, only_ontologies): +def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, only_ontologies): if only_ontologies: print(only_ontologies) is_archivo_ontology = is_archivo_ontology_request(wrapped_request) if not is_archivo_ontology: return True - return False + return False -def load_archivo_urls() -> None: +def load_archivo_urls(): """Load the archivo URLs into the global variable if not already loaded.""" global ARCHIVO_PARSED_URLS print(ARCHIVO_PARSED_URLS) if not ARCHIVO_PARSED_URLS: # Load only if the set is empty - logger.info('Loading archivo ontologies from file') - with open('ontologytimemachine/utils/archivo_ontologies.txt', 'r') as file: + logger.info("Loading archivo ontologies from file") + with open("ontologytimemachine/utils/archivo_ontologies.txt", "r") as file: ARCHIVO_PARSED_URLS = { - (urlparse(line.strip()).netloc, urlparse(line.strip()).path) for line in file + (urlparse(line.strip()).netloc, urlparse(line.strip()).path) + for line in file } def 
get_response_from_request(wrapped_request, config): - if do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config["restrictedAccess"]): - logger.warning('Request denied: not an ontology request and only ontologies mode is enabled') + do_deny = do_deny_request_due_non_archivo_ontology_uri( + wrapped_request, config.restrictedAccess + ) + if do_deny: + logger.warning( + "Request denied: not an ontology request and only ontologies mode is enabled" + ) return mock_response_403 - - if is_archivo_ontology_request(wrapped_request): - logger.debug('The request is for an ontology') - response = proxy_logic(wrapped_request, - config["ontoFormat"], - config["ontoVersion"], - config["disableRemovingRedirects"], - config["timestamp"], - config["manifest"]) - return response + + response = proxy_logic( + wrapped_request, + config.ontoFormat, + config.ontoVersion, + config.disableRemovingRedirects, + config.timestamp, + ) + return response -def is_archivo_ontology_request(wrapped_request) -> bool: +def is_archivo_ontology_request(wrapped_request): """Check if the requested ontology is in the archivo.""" - logger.info('Check if the requested ontology is in archivo') + logger.info("Check if the requested ontology is in archivo") # Ensure the archivo URLs are loaded load_archivo_urls() # Extract the request's host and path - request_host = wrapped_request.get_request().host.decode('utf-8') - request_path = wrapped_request.get_request().path.decode('utf-8') - + request_host = wrapped_request.get_request().host.decode("utf-8") + request_path = wrapped_request.get_request().path.decode("utf-8") + print((request_host, request_path) in ARCHIVO_PARSED_URLS) # Check if the (host, path) tuple exists in ARCHIVO_PARSED_URLS @@ -83,16 +96,20 @@ def is_archivo_ontology_request(wrapped_request) -> bool: def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): allow_redirects = not disableRemovingRedirects try: - response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=5) - logger.info('Successfully fetched original ontology') + response = requests.get( + url=url, headers=headers, allow_redirects=allow_redirects, timeout=5 + ) + logger.info("Successfully fetched original ontology") return response except Exception as e: - logger.error(f'Error fetching original ontology: {e}') + logger.error(f"Error fetching original ontology: {e}") return mock_response_404() -def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp, manifest): - logger.info('Proxy has to intervene') +def proxy_logic( + wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp +): + logger.info("Proxy has to intervene") set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) @@ -102,70 +119,90 @@ def proxy_logic(wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirec # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through format = get_format_from_accept_header(headers) - if not format and ontoVersion != 'original': - logger.info(f'No format can be used from Archivo') + if not format and ontoVersion != "original": + logger.info(f"No format can be used from Archivo") return mock_response_500 - - if ontoVersion == 'original': + + if ontoVersion == "original": response = fetch_original(ontology, headers, disableRemovingRedirects) - elif ontoVersion == 'originalFailoverLiveLatest': - response = fetch_failover(ontology, headers, 
disableRemovingRedirects) - elif ontoVersion == 'latestArchived': - response = fetch_latest_archived(ontology, headers) - elif ontoVersion == 'timestampArchived': - response = fetch_timestamp_archived(ontology, headers, timestamp) - elif ontoVersion == 'dependencyManifest': - response = fetch_dependency_manifest(ontology, headers, manifest) + elif ontoVersion == "originalFailoverLiveLatest": + response = fetch_failover( + wrapped_request, ontology, headers, disableRemovingRedirects + ) + elif ontoVersion == "latestArchived": + response = fetch_latest_archived(wrapped_request, ontology, headers) + elif ontoVersion == "timestampArchived": + response = fetch_timestamp_archived( + wrapped_request, ontology, headers, timestamp + ) + # Commenting the manifest related part because it is not supported in the current version + # elif ontoVersion == 'dependencyManifest': + # response = fetch_dependency_manifest(ontology, headers, manifest) return response # Fetch from the original source, no matter what def fetch_original(ontology, headers, disableRemovingRedirects): - logger.info(f'Fetching original ontology from URL: {ontology}') + logger.info(f"Fetching original ontology from URL: {ontology}") return request_ontology(ontology, headers, disableRemovingRedirects) # Failover mode -def fetch_failover(ontology, headers, disableRemovingRedirects): - logger.info(f'Fetching original ontology with failover from URL: {ontology}') +def fetch_failover(wrapped_request, ontology, headers, disableRemovingRedirects): + logger.info(f"Fetching original ontology with failover from URL: {ontology}") original_response = request_ontology(ontology, headers, disableRemovingRedirects) if original_response.status_code in passthrough_status_codes: - requested_mimetypes_with_priority = parse_accept_header_with_priority(headers['Accept']) + requested_mimetypes_with_priority = parse_accept_header_with_priority( + headers["Accept"] + ) requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] - response_mime_type = original_response.headers.get('Content-Type', ';').split(';')[0] - logger.info(f'Requested mimetypes: {requested_mimetypes}') - logger.info(f'Response mimetype: {response_mime_type}') + response_mime_type = original_response.headers.get("Content-Type", ";").split( + ";" + )[0] + logger.info(f"Requested mimetypes: {requested_mimetypes}") + logger.info(f"Response mimetype: {response_mime_type}") if response_mime_type in requested_mimetypes: - return original_response + return original_response else: - logging.info(f'The returned type is not the same as the requested one') - return fetch_latest_archived(ontology, headers) + logging.info(f"The returned type is not the same as the requested one") + return fetch_latest_archived(wrapped_request, ontology, headers) else: - logger.info(f'The returend status code is not accepted: {original_response.status_code}') - return fetch_latest_archived(ontology, headers) + logger.info( + f"The returend status code is not accepted: {original_response.status_code}" + ) + return fetch_latest_archived(wrapped_request, ontology, headers) # Fetch the lates version from archivo (no timestamp defined) -def fetch_latest_archived(ontology, headers): - logger.info('Fetch latest archived') +def fetch_latest_archived(wrapped_request, ontology, headers): + if not is_archivo_ontology_request(wrapped_request): + logger.info( + "Data needs to be fetched from Archivo, but ontology is not available on Archivo." 
+ ) + return mock_response_404() + logger.info("Fetch latest archived") format = get_format_from_accept_header(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}' - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}" + logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) - -def fetch_timestamp_archived(ontology, headers, timestamp): - logger.info('Fetch archivo timestamp') +def fetch_timestamp_archived(wrapped_request, ontology, headers, timestamp): + if not is_archivo_ontology_request(wrapped_request): + logger.info( + "Data needs to be fetched from Archivo, but ontology is not available on Archivo." + ) + return mock_response_404() + logger.info("Fetch archivo timestamp") format = get_format_from_accept_header(headers) - dbpedia_url = f'{dbpedia_api}?o={ontology}&f={format}&v={timestamp}' - logger.info(f'Fetching from DBpedia Archivo API: {dbpedia_url}') + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={timestamp}" + logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) def fetch_dependency_manifest(ontology, headers, manifest): - logger.info(f'The dependency manifest is currently not supported') + logger.info(f"The dependency manifest is currently not supported") return mock_response_500 # # Parse RDF data from the dependencies file # manifest_g = rdflib.Graph() @@ -175,24 +212,24 @@ def fetch_dependency_manifest(ontology, headers, manifest): # # Extract dependencies related to the ontology link # ontology = rdflib.URIRef(ontology) - + # dependencies = manifest_g.subjects(predicate=version_namespace.dependency, object=ontology) # for dependency in dependencies: # dep_snapshot = g.value(subject=dependency, predicate=version_namespace.snapshot) # dep_file = g.value(subject=dependency, predicate=version_namespace.file) - + # # Make request to DBpedia archive API # if dep_file: # version_param = dep_file.split('v=')[1] - # api_url = f"{dbpedia_api}?o={ontology}&v={version_param}" + # api_url = f"{archivo_api}?o={ontology}&v={version_param}" # else: - # api_url = f"{dbpedia_api}?o={ontology}" - + # api_url = f"{archivo_api}?o={ontology}" + # response = requests.get(api_url) # if response.status_code == 200: # logger.info(f"Successfully fetched {api_url}") # return response # else: # logger.error(f"Failed to fetch {api_url}, status code: {response.status_code}") - # return mock_response_404 \ No newline at end of file + # return mock_response_404 diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index d83465f..fb26e65 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -7,7 +7,7 @@ logger = logging.getLogger(__name__) -dbpedia_api = 'https://archivo.dbpedia.org/download' +archivo_api = 'https://archivo.dbpedia.org/download' archivo_mimetypes = ['application/rdf+xml', 'application/owl+xml', 'text/turtle', 'application/n-triples'] passthrough_status_codes = [ @@ -18,89 +18,6 @@ ] -def parse_arguments(): - parser = argparse.ArgumentParser(description='Process ontology format and version.') - - # Defining ontoFormat argument with nested options - parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], - default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') - - parser.add_argument('--ontoPrecedence', type=str, choices=['default', 
'enforcedPriority', 'always'], - default='enforcedPriority', help='Precedence of the ontology: default, enforcedPriority, always') - - parser.add_argument('--patchAcceptUpstream', type=bool, default=False, - help='Defines if the Accept Header is patched upstream in original mode.') - - # Defining ontoVersion argument - parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLiveLatest', - 'latestArchived', 'timestampArchived', 'dependencyManifest'], - default='originalFailoverLiveLatest', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') - - # Enable/disable mode to only proxy requests to ontologies - parser.add_argument('--restrictedAccess', type=bool, default=False, - help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') - - # Enable HTTPS interception for specific domains - parser.add_argument('--httpsInterception', type=str, choices=['none', 'all', 'block'], - default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') - - # Enable/disable inspecting or removing redirects - parser.add_argument('--disableRemovingRedirects', type=bool, default=False, - help='Enable/disable inspecting or removing redirects.') - - # Enable/disable proxy forward headers - parser.add_argument('--forwardHeaders', type=bool, default=True, - help='Enable/disable proxy forward headers.') - - args = parser.parse_args() - - # Check the value of --ontoVersion and prompt for additional arguments if needed - if args.ontoVersion == 'timestampArchived': - args.timestamp = input('Please provide the timestamp (e.g., YYYY-MM-DD): ') - elif args.ontoVersion == 'dependencyManifest': - args.manifest = input('Please provide the manifest file path: ') - - # Accessing the arguments - logger.info(f"Selected Ontology Version: {args.ontoVersion}") - if hasattr(args, 'timestamp'): - logger.info(f"Timestamp: {args.timestamp}") - timestamp = args.timestamp - else: - timestamp = None - - if hasattr(args, 'manifest'): - logger.info(f"Manifest File Path: {args.manifest}") - manifest = args.manifest - else: - manifest = None - - ontoFormat = { - 'format': args.ontoFormat, - 'precedence': args.ontoPrecedence, - 'patchAcceptUpstream': args.patchAcceptUpstream - } - - logger.info(f'Ontology Format: {ontoFormat}') - logger.info(f'Ontology Version: {args.ontoVersion}') - logger.info(f'Only Ontologies Mode: {args.restrictedAccess}') - logger.info(f'HTTPS Interception: {args.httpsInterception}') - logger.info(f'Inspect Redirects: {args.disableRemovingRedirects}') - logger.info(f'Forward Headers: {args.forwardHeaders}') - - config = { - "ontoFormat": ontoFormat, - "ontoVersion": args.ontoVersion, - "restrictedAccess": args.restrictedAccess, - "httpsInterception": args.httpsInterception, - "disableRemovingRedirects": args.disableRemovingRedirects, - "forward_headers": args.forwardHeaders, - "timestamp": timestamp, - "manifest": manifest, - } - - return config - - def get_mime_type(format='turtle'): # Define a mapping of formats to MIME types format_to_mime = { @@ -114,6 +31,18 @@ def get_mime_type(format='turtle'): return format_to_mime.get(format, 'text/turtle') +def map_mime_to_format(mime_type): + # Map file extensions to formats + mime_to_format = { + 'application/rdf+xml': 'owl', # Common MIME type for OWL files + 'application/owl+xml': 'owl', # Specific MIME type for OWL + 'text/turtle': 'ttl', # MIME type for Turtle format + 
'application/n-triples': 'nt', # MIME type for N-Triples format + } + + return mime_to_format.get(mime_type, None) + + def set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion): logger.info(f'Setting headers based on ontoFormat: {ontoFormat} and ontoVersion: {ontoVersion}') @@ -159,18 +88,6 @@ def select_highest_priority_mime_from_archivo(mime_list): return None -def map_mime_to_format(mime_type): - # Map file extensions to formats - mime_to_format = { - 'application/rdf+xml': 'owl', # Common MIME type for OWL files - 'application/owl+xml': 'owl', # Specific MIME type for OWL - 'text/turtle': 'ttl', # MIME type for Turtle format - 'application/n-triples': 'nt', # MIME type for N-Triples format - } - - return mime_to_format.get(mime_type, None) - - def parse_accept_header_with_priority(accept_header): logger.info('Parse accept header') # Parse the Accept header to extract MIME types and their priority (q values) diff --git a/tests/oldtest_integration.py b/tests/oldtest_integration.py deleted file mode 100644 index d5d8b8e..0000000 --- a/tests/oldtest_integration.py +++ /dev/null @@ -1,193 +0,0 @@ -import pytest -import requests -import time -import subprocess -import itertools -from ontologytimemachine.custom_proxy import IP, PORT - - -PROXY = f'{IP}:{PORT}' -HTTP_PROXY = f'http://{PROXY}' -HTTPS_PROXY = f'http://{PROXY}' -PROXIES = { - "http": HTTP_PROXY, - "https": HTTPS_PROXY -} -CA_CERT_PATH = "ca-cert.pem" - - -@pytest.fixture(scope="module", autouse=True) -def start_proxy_server(): - # Start the proxy server in a subprocess - process = subprocess.Popen( - [ - 'python3', '-m', 'proxy', - '--ca-key-file', 'ca-key.pem', - '--ca-cert-file', 'ca-cert.pem', - '--ca-signing-key-file', 'ca-signing-key.pem', - '--hostname', IP, - '--port', PORT, - '--plugins', 'ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin' - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE - ) - - # Wait a bit to ensure the server starts - time.sleep(5) - - yield - "http://0.0.0.0:8899" - # Terminate the proxy server after tests - process.terminate() - process.wait() - - -def test_babelnet(): - iri = 'http://babelnet.org/rdf/' - generic_test(iri, 'text/turtle') - - -def test_bag_basisregistraties(): - iri = 'http://bag.basisregistraties.overheid.nl/def/bag' - generic_test(iri, 'text/turtle') - - -def test_bblfish(): - iri = 'http://bblfish.net/work/atom-owl/2006-06-06/' - generic_test(iri, 'text/turtle') - - -def test_brk_basisregistraties(): - iri = 'http://brk.basisregistraties.overheid.nl/def/brk' - generic_test(iri, 'text/turtle') - - -def test_brt_basisregistraties(): - iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' - generic_test(iri, 'text/turtle') - - -def test_brt_basisregistraties_begrippenkader(): - iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' - generic_test(iri, 'text/turtle') - - -def test_buzzword(): - iri = 'http://buzzword.org.uk/rdf/personal-link-types#' - generic_test(iri, 'text/turtle') - - -def test_catalogus_professorum(): - iri = 'http://catalogus-professorum.org/cpm/2/' - generic_test(iri, 'text/turtle') - - -def test_data_gov(): - iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' - generic_test(iri, 'text/turtle') - - -def test_data_bigdatagrapes(): - iri = 'http://data.bigdatagrapes.eu/resource/ontology/' - generic_test(iri, 'text/turtle') - - -def test_data_europa_esco(): - iri = 'http://data.europa.eu/esco/flow' - generic_test(iri, 'text/turtle') - - -def test_data_globalchange(): - iri = 
'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') - - -def test_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') - - -def test_data_opendiscoveryspace(): - iri = 'http://data.opendiscoveryspace.eu/lom_ontology_ods.owl#' - generic_test(iri, 'text/turtle') - - -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://data.ordnancesurvey.co.uk/ontology/50kGazetteer/' - generic_test(iri, 'text/turtle') - - -def test_data_ordnancesurvey_50kGazetteer(): - iri = 'http://dbpedia.org/ontology/Person' - generic_test(iri, 'text/turtle') - - -def test_linked_web_apis(): - iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' - generic_test(iri, 'text/turtle') - - -#def test_ontologi_es(): -# iri = 'http://ontologi.es/days#' -# generic_test(iri, 'text/turtle') - - -def test_https(): - iri = "https://www.w3id.org/simulation/ontology/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def test_https(): - iri = "https://vocab.eccenca.com/auth/" - generic_test(iri, 'text/plain; charset=utf-8') - - -def not_test_all_iris(): - with open('tests/archivo_ontologies_test.txt', 'r') as file: - for line in file: - iri = line.strip() - if iri: # Ensure it's not an empty line - iri_generic_test(iri) - - -def generic_test(iri, content_type): - response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) - assert response.status_code == 200 - assert iri in response.content.decode('utf-8') - - -def iri_generic_test(iri): - try: - response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) - assert response.status_code == 200 - assert iri in response.content.decode('utf-8') - print(f"Test passed for IRI: {iri}") - except AssertionError: - print(f"Test failed for IRI: {iri}") - except requests.exceptions.RequestException as e: - print(f"Request failed for IRI: {iri}, Error: {e}") - - -def get_parameter_combinations(): -# Define the possible values for each parameter - ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] - ontoPrecedence = ['default', 'enforcedPriority', 'always'] - patchAcceptUpstream = [True, False] - ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', - 'latestArchive', 'timestampArchive', 'dependencyManifest'] - onlyOntologies = [True, False] - httpsIntercept = [True, False] - inspectRedirects = [True, False] - forwardHeaders = [True, False] - subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] - - combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, - onlyOntologies, httpsIntercept, inspectRedirects, - forwardHeaders, subjectBinarySearchThreshold)) - return combinations - - -if __name__ == '__main__': - - pytest.main() From b284e7437b60355ed2e01fdf8a7bb107879dc9f5 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Mon, 7 Oct 2024 01:23:16 +0200 Subject: [PATCH 22/35] Add testcases --- ontologytimemachine/utils/config.py | 112 +++++++++++++++-------- tests/test_config.py | 36 ++++++++ tests/test_mock_responses.py | 34 +++++++ tests/test_proxy_logic.py | 41 +++++++++ tests/test_unit.py | 136 ---------------------------- tests/test_utils.py | 103 +++++++++++++++++++++ 6 files changed, 290 insertions(+), 172 deletions(-) create mode 100644 tests/test_config.py create mode 100644 tests/test_mock_responses.py create mode 100644 tests/test_proxy_logic.py delete mode 100644 tests/test_unit.py create mode 100644 tests/test_utils.py diff --git a/ontologytimemachine/utils/config.py 
b/ontologytimemachine/utils/config.py index 07c64c1..830511f 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py @@ -5,74 +5,114 @@ class LogLevel(Enum): - DEBUG = 'debug' - INFO = 'info' - WARNING = 'warning' - ERROR = 'error' + DEBUG = "debug" + INFO = "info" + WARNING = "warning" + ERROR = "error" @dataclass class Config: logLevel: LogLevel = LogLevel.INFO ontoFormat: Dict[str, Any] = None - ontoVersion: str = '' + ontoVersion: str = "" restrictedAccess: bool = False httpsInterception: bool = False disableRemovingRedirects: bool = False - timestamp: str = '' - #manifest: Dict[str, Any] = None + timestamp: str = "" + # manifest: Dict[str, Any] = None def parse_arguments() -> Config: - parser = argparse.ArgumentParser(description='Process ontology format and version.') + parser = argparse.ArgumentParser(description="Process ontology format and version.") # Defining ontoFormat argument with nested options - parser.add_argument('--ontoFormat', type=str, choices=['turtle', 'ntriples', 'rdfxml', 'htmldocu'], - default='turtle', help='Format of the ontology: turtle, ntriples, rdfxml, htmldocu') + parser.add_argument( + "--ontoFormat", + type=str, + choices=["turtle", "ntriples", "rdfxml", "htmldocu"], + default="turtle", + help="Format of the ontology: turtle, ntriples, rdfxml, htmldocu", + ) - parser.add_argument('--ontoPrecedence', type=str, choices=['default', 'enforcedPriority', 'always'], - default='enforcedPriority', help='Precedence of the ontology: default, enforcedPriority, always') + parser.add_argument( + "--ontoPrecedence", + type=str, + choices=["default", "enforcedPriority", "always"], + default="enforcedPriority", + help="Precedence of the ontology: default, enforcedPriority, always", + ) - parser.add_argument('--patchAcceptUpstream', type=bool, default=False, - help='Defines if the Accept Header is patched upstream in original mode.') + parser.add_argument( + "--patchAcceptUpstream", + type=bool, + default=False, + help="Defines if the Accept Header is patched upstream in original mode.", + ) # Defining ontoVersion argument - parser.add_argument('--ontoVersion', type=str, choices=['original', 'originalFailoverLiveLatest', - 'latestArchived', 'timestampArchived', 'dependencyManifest'], - default='originalFailoverLiveLatest', help='Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest') + parser.add_argument( + "--ontoVersion", + type=str, + choices=[ + "original", + "originalFailoverLiveLatest", + "latestArchived", + "timestampArchived", + "dependencyManifest", + ], + default="originalFailoverLiveLatest", + help="Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest", + ) # Enable/disable mode to only proxy requests to ontologies - parser.add_argument('--restrictedAccess', type=bool, default=False, - help='Enable/disable mode to only proxy requests to ontologies stored in Archivo.') + parser.add_argument( + "--restrictedAccess", + type=bool, + default=False, + help="Enable/disable mode to only proxy requests to ontologies stored in Archivo.", + ) # Enable HTTPS interception for specific domains - parser.add_argument('--httpsInterception', type=str, choices=['none', 'all', 'block'], - default='all', help='Enable HTTPS interception for specific domains: none, archivo, all, listfilename.') + parser.add_argument( + "--httpsInterception", + type=str, + choices=["none", 
"all", "block"], + default="all", + help="Enable HTTPS interception for specific domains: none, archivo, all, listfilename.", + ) # Enable/disable inspecting or removing redirects - parser.add_argument('--disableRemovingRedirects', type=bool, default=False, - help='Enable/disable inspecting or removing redirects.') - + parser.add_argument( + "--disableRemovingRedirects", + type=bool, + default=False, + help="Enable/disable inspecting or removing redirects.", + ) + # Log level - parser.add_argument('--logLevel', type=str, default='info', - help='Level of the logging: debug, info, warning, error.') + parser.add_argument( + "--logLevel", + type=str, + default="info", + help="Level of the logging: debug, info, warning, error.", + ) args = parser.parse_args() # Check the value of --ontoVersion and prompt for additional arguments if needed - if args.ontoVersion == 'timestampArchived': - args.timestamp = input('Please provide the timestamp (e.g., YYYY-MM-DD): ') + if args.ontoVersion == "timestampArchived": + args.timestamp = input("Please provide the timestamp (e.g., YYYY-MM-DD): ") # Commenting manifest related code as it is not supported in the current version # elif args.ontoVersion == 'dependencyManifest': # args.manifest = input('Please provide the manifest file path: ') # Accessing the arguments - if hasattr(args, 'timestamp'): - logger.info(f"Timestamp: {args.timestamp}") + if hasattr(args, "timestamp"): timestamp = args.timestamp else: timestamp = None - + # if hasattr(args, 'manifest'): # logger.info(f"Manifest File Path: {args.manifest}") # manifest = args.manifest @@ -81,9 +121,9 @@ def parse_arguments() -> Config: # Create ontoFormat dictionary ontoFormat = { - 'format': args.ontoFormat, - 'precedence': args.ontoPrecedence, - 'patchAcceptUpstream': args.patchAcceptUpstream + "format": args.ontoFormat, + "precedence": args.ontoPrecedence, + "patchAcceptUpstream": args.patchAcceptUpstream, } # Initialize the Config class with parsed arguments @@ -94,8 +134,8 @@ def parse_arguments() -> Config: restrictedAccess=args.restrictedAccess, httpsInterception=args.httpsInterception, disableRemovingRedirects=args.disableRemovingRedirects, - timestamp=timestamp - #manifest=manifest + timestamp=timestamp, + # manifest=manifest ) return config diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..8d1db7e --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,36 @@ +import unittest +from ontologytimemachine.utils.config import parse_arguments, Config +import sys + + +class TestConfig(unittest.TestCase): + + def test_parse_arguments(self): + test_args = [ + "test", + "--ontoFormat", + "turtle", + "--ontoPrecedence", + "enforcedPriority", + "--patchAcceptUpstream", + "False", + "--ontoVersion", + "original", + "--httpsInterception", + "none", + "--disableRemovingRedirects", + "False", + "--logLevel", + "info", + ] + sys.argv = test_args + config = parse_arguments() + self.assertIsInstance(config, Config) + self.assertEqual(config.ontoFormat["format"], "turtle") + self.assertEqual(config.ontoVersion, "original") + self.assertEqual(config.restrictedAccess, False) + self.assertEqual(config.httpsInterception, "none") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_mock_responses.py b/tests/test_mock_responses.py new file mode 100644 index 0000000..a145d60 --- /dev/null +++ b/tests/test_mock_responses.py @@ -0,0 +1,34 @@ +import unittest +from ontologytimemachine.utils.mock_responses import ( + mock_response_200, + mock_response_403, + 
mock_response_404, + mock_response_500, +) + + +class TestMockResponses(unittest.TestCase): + + def test_mock_response_200(self): + response = mock_response_200() + self.assertEqual(response.status_code, 200) + self.assertIn("
To be implemented
", response.text) + + def test_mock_response_403(self): + response = mock_response_403() + self.assertEqual(response.status_code, 403) + self.assertIn("403 Forbidden", response.text) + + def test_mock_response_404(self): + response = mock_response_404() + self.assertEqual(response.status_code, 404) + self.assertIn("404 Not Found", response.text) + + def test_mock_response_500(self): + response = mock_response_500() + self.assertEqual(response.status_code, 500) + self.assertIn("500 Internal Server Error", response.text) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_proxy_logic.py b/tests/test_proxy_logic.py new file mode 100644 index 0000000..daf3701 --- /dev/null +++ b/tests/test_proxy_logic.py @@ -0,0 +1,41 @@ +import unittest +from ontologytimemachine.utils.proxy_logic import ( + if_intercept_host, + do_deny_request_due_non_archivo_ontology_uri, + load_archivo_urls, + is_archivo_ontology_request, + proxy_logic, + fetch_original, +) + + +class TestProxyLogic(unittest.TestCase): + + def test_if_intercept_host(self): + self.assertTrue(if_intercept_host("all")) + self.assertFalse(if_intercept_host("block")) + self.assertTrue(if_intercept_host("none")) + + def test_do_deny_request_due_non_archivo_ontology_uri(self): + # Assuming we are using some sample data structure + class WrappedRequest: + def __init__(self, host, path): + self.host = host + self.path = path + + def get_request(self): + return self + + request = WrappedRequest(b"example.com", b"/ontology") + self.assertTrue(do_deny_request_due_non_archivo_ontology_uri(request, True)) + self.assertFalse(do_deny_request_due_non_archivo_ontology_uri(request, False)) + + def test_fetch_original(self): + url = "https://example.com" + headers = {"Accept": "text/html"} + response = fetch_original(url, headers, False) + self.assertEqual(response.status_code, 200) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_unit.py b/tests/test_unit.py deleted file mode 100644 index 9cd7856..0000000 --- a/tests/test_unit.py +++ /dev/null @@ -1,136 +0,0 @@ -import unittest -from unittest.mock import patch, Mock -import argparse -import requests - -from ontologytimemachine.utils.mock_responses import ( - mock_response_200, - mock_response_403, - mock_response_404, - mock_response_500 -) -from ontologytimemachine.utils.utils import ( - parse_arguments, - map_mime_to_format, - get_format_from_accept_header -) - -from ontologytimemachine.utils.proxy_logic import ( - fetch_latest_archived -) - -class TestUtils(unittest.TestCase): - - @patch('argparse.ArgumentParser.parse_args') - def test_parse_arguments(self, mock_parse_args): - mock_parse_args.return_value = argparse.Namespace( - ontoFormat='turtle', - ontoPrecedence='enforcedPriority', - patchAcceptUpstream=False, - ontoVersion='originalFailoverLive', - restrictedAccess=True, - httpsInterception=False, - disableRemovingRedirects=True, - forwardHeaders=True - ) - - args = parse_arguments() - - self.assertEqual(args[0]['format'], 'turtle') - self.assertEqual(args[0]['precedence'], 'enforcedPriority') - self.assertFalse(args[0]['patchAcceptUpstream']) - self.assertEqual(args[1], 'originalFailoverLive') - self.assertTrue(args[2]) - self.assertFalse(args[3]) - self.assertTrue(args[4]) - self.assertTrue(args[5]) - - mock_parse_args.return_value = argparse.Namespace( - ontoFormat='ntriples', - ontoPrecedence='default', - patchAcceptUpstream=True, - ontoVersion='latestArchive', - restrictedAccess=False, - httpsInterception=True, - disableRemovingRedirects=False, - 
forwardHeaders=False - ) - - args = parse_arguments() - - self.assertEqual(args[0]['format'], 'ntriples') - self.assertEqual(args[0]['precedence'], 'default') - self.assertTrue(args[0]['patchAcceptUpstream']) - self.assertEqual(args[1], 'latestArchive') - self.assertFalse(args[2]) - self.assertTrue(args[3]) - self.assertFalse(args[4]) - self.assertFalse(args[5]) - - - @patch('requests.get') - def test_fetch_latest_archived(self, mock_get): - mock_response = Mock() - mock_response.status_code = 200 - mock_get.return_value = mock_response - - ontology = 'http://dbpedia.org/ontology/Person' - headers = {'Accept': 'text/turtle'} - - response = fetch_latest_archived(ontology, headers) - self.assertEqual(response.status_code, 200) - - mock_get.side_effect = requests.exceptions.RequestException - response = fetch_latest_archived(ontology, headers) - self.assertEqual(response.status_code, 404) - - def test_map_mime_to_format(self): - self.assertEqual(map_mime_to_format('application/rdf+xml'), 'owl') - self.assertEqual(map_mime_to_format('text/turtle'), 'ttl') - self.assertEqual(map_mime_to_format('application/n-triples'), 'nt') - self.assertIsNone(map_mime_to_format('unknown/mime')) - - def test_get_format_from_accept_header(self): - headers = {'Accept': 'application/json'} - format = get_format_from_accept_header(headers) - self.assertEqual(format, None) - - headers = {} - format = get_format_from_accept_header(headers) - - self.assertIsNone(format, None) - - headers = {'Accept': 'text/turtle'} - format = get_format_from_accept_header(headers) - self.assertEqual(format, 'ttl') - - -class TestMockResponses(unittest.TestCase): - - def test_mock_response_200(self): - response = mock_response_200() - self.assertEqual(response.status_code, 200) - self.assertEqual(response.url, 'https://example.com/success') - self.assertEqual(response.headers['Content-Type'], 'text/html') - self.assertIn(b'
To be implemented
', response.content) - - def test_mock_response_403(self): - response = mock_response_403() - self.assertEqual(response.status_code, 403) - self.assertEqual(response.url, 'https://example.com/forbidden') - self.assertEqual(response.headers['Content-Type'], 'text/html') - self.assertIn(b'
403 Forbidden
', response.content) - - def test_mock_response_404(self): - response = mock_response_404() - self.assertEqual(response.status_code, 404) - self.assertEqual(response.url, 'https://example.com/resource-not-found') - self.assertEqual(response.headers['Content-Type'], 'text/html') - self.assertIn(b'
404 Not Found
', response.content) - - def test_mock_response_500(self): - response = mock_response_500() - self.assertEqual(response.status_code, 500) - self.assertEqual(response.url, 'https://example.com/internal-server-error') - self.assertEqual(response.headers['Content-Type'], 'text/html') - self.assertIn(b'
500 Internal Server Error
', response.content) \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..60cc702 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,103 @@ +import unittest +from unittest.mock import patch, Mock +import argparse +import requests + +from ontologytimemachine.utils.utils import ( + get_mime_type, + map_mime_to_format, + get_format_from_accept_header, + select_highest_priority_mime_from_archivo, + parse_accept_header_with_priority, + set_onto_format_headers, +) + + +class TestUtils(unittest.TestCase): + + def test_get_mime_type(self): + self.assertEqual(get_mime_type("turtle"), "text/turtle") + self.assertEqual(get_mime_type("rdfxml"), "application/rdf+xml") + self.assertEqual(get_mime_type("ntriples"), "application/n-triples") + self.assertEqual(get_mime_type("htmldocu"), "text/html") + self.assertEqual(get_mime_type("unknown"), "text/turtle") # Default + + def test_map_mime_to_format(self): + self.assertEqual(map_mime_to_format("application/rdf+xml"), "owl") + self.assertEqual(map_mime_to_format("application/owl+xml"), "owl") + self.assertEqual(map_mime_to_format("text/turtle"), "ttl") + self.assertEqual(map_mime_to_format("application/n-triples"), "nt") + self.assertIsNone(map_mime_to_format("unknown/mime")) + + def test_select_highest_priority_mime_from_archivo(self): + archivo_mime_types = [ + ("application/rdf+xml", 1.0), + ("text/turtle", 0.8), + ("application/n-triples", 1.0), + ] + result = select_highest_priority_mime_from_archivo(archivo_mime_types) + self.assertEqual(result, "application/rdf+xml") + + archivo_mime_types = [ + ("text/html", 0.8), # Unsupported type + ] + result = select_highest_priority_mime_from_archivo(archivo_mime_types) + self.assertIsNone(result) + + def test_parse_accept_header_with_priority(self): + accept_header = ( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8" + ) + parsed_result = parse_accept_header_with_priority(accept_header) + expected_result = [ + ("text/html", 1), + ("application/xhtml+xml", 1), + ("image/webp", 1), + ("application/xml", 0.9), + ("*/*", 0.8), + ] + print(parsed_result) + print(expected_result) + self.assertEqual(parsed_result, expected_result) + + def test_get_format_from_accept_header(self): + headers = {"Accept": "application/rdf+xml,text/turtle;q=0.9,*/*;q=0.8"} + format_result = get_format_from_accept_header(headers) + self.assertEqual(format_result, "owl") + + headers_empty = {} + format_result = get_format_from_accept_header(headers_empty) + self.assertIsNone(format_result) + + @patch("requests.get") + def test_fetch_latest_archived(self, mock_get): + mock_response = Mock() + mock_response.status_code = 200 + mock_get.return_value = mock_response + + ontology = "http://dbpedia.org/ontology/Person" + headers = {"Accept": "text/turtle"} + + def test_map_mime_to_format(self): + self.assertEqual(map_mime_to_format("application/rdf+xml"), "owl") + self.assertEqual(map_mime_to_format("text/turtle"), "ttl") + self.assertEqual(map_mime_to_format("application/n-triples"), "nt") + self.assertIsNone(map_mime_to_format("unknown/mime")) + + def test_get_format_from_accept_header(self): + headers = {"Accept": "application/json"} + format = get_format_from_accept_header(headers) + self.assertEqual(format, None) + + headers = {} + format = get_format_from_accept_header(headers) + + self.assertIsNone(format, None) + + headers = {"Accept": "text/turtle"} + format = get_format_from_accept_header(headers) + self.assertEqual(format, "ttl") + + +if 
__name__ == "__main__": + unittest.main() From b4057d4bd4dfca131ab10edd55e477859caafc1b Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 09:00:36 +0200 Subject: [PATCH 23/35] fix startup command in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f335563..39e9ecb 100644 --- a/README.md +++ b/README.md @@ -46,4 +46,4 @@ poetry install poetry shell ``` -python3 -m proxy --ca-key-file ca-key.pem --ca-cert-file ca-cert.pem --ca-signing-key-file ca-signing-key.pem --hostname IP --port 8899 --plugins ontologytimemachine.custom_proxy.OntologyTimeMachinePlugin --ontoFormat ntriples --ontoVersion originalFailoverLive --ontoPrecedence enforcedPriority \ No newline at end of file +python3 ontologytimemachine/custom_proxy.py --ontoFormat ntriples --ontoVersion originalFailoverLiveLatest --ontoPrecedence enforcedPriority \ No newline at end of file From 945a6fed41dcc4f6e5254d5f7681718cbe44eb14 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 09:45:47 +0200 Subject: [PATCH 24/35] modify wrapper function for get_request host and path --- ontologytimemachine/proxy_wrapper.py | 57 +++++++++++++++++----------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index 0e897ea..785d604 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -4,7 +4,9 @@ from typing import Tuple, Dict, Any # Configure logger -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) @@ -29,7 +31,11 @@ def is_https_request(self) -> bool: pass @abstractmethod - def get_request(self) -> Any: + def get_request_host(self) -> Any: + pass + + @abstractmethod + def get_request_path(self) -> Any: pass @abstractmethod @@ -54,46 +60,51 @@ def __init__(self, request: HttpParser) -> None: super().__init__(request) def is_get_request(self) -> bool: - return self.request.method == b'GET' + return self.request.method == b"GET" def is_connect_request(self) -> bool: - return self.request.method == b'CONNECT' + return self.request.method == b"CONNECT" def is_head_request(self) -> bool: - return self.request.method == b'HEAD' + return self.request.method == b"HEAD" def is_https_request(self) -> bool: - return self.request.method == b'CONNECT' or self.request.headers.get(b'Host', b'').startswith(b'https') + return self.request.method == b"CONNECT" or self.request.headers.get( + b"Host", b"" + ).startswith(b"https") - def get_request(self) -> HttpParser: - return self.request + def get_request_host(self) -> str: + return self.request.host.decode("utf-8") + + def get_request_path(self) -> str: + return self.request.host.decode("utf-8") def get_request_headers(self) -> Dict[str, str]: headers: Dict[str, str] = {} for k, v in self.request.headers.items(): - headers[v[0].decode('utf-8')] = v[1].decode('utf-8') + headers[v[0].decode("utf-8")] = v[1].decode("utf-8") return headers def get_request_accept_header(self) -> str: - logger.info('Wrapper - get_request_accept_header') - return self.request.headers[b'accept'][1].decode('utf-8') - + logger.info("Wrapper - get_request_accept_header") + return self.request.headers[b"accept"][1].decode("utf-8") + def set_request_accept_header(self, mime_type: str) -> None: - 
self.request.headers[b'accept'] = (b'Accept', mime_type.encode('utf-8')) + self.request.headers[b"accept"] = (b"Accept", mime_type.encode("utf-8")) logger.info(f'Accept header set to: {self.request.headers[b"accept"][1]}') - + def get_request_url_host_path(self) -> Tuple[str, str, str]: - logger.info('Get ontology from request') - if (self.request.method in {b'GET', b'HEAD'}) and not self.request.host: + logger.info("Get ontology from request") + if (self.request.method in {b"GET", b"HEAD"}) and not self.request.host: for k, v in self.request.headers.items(): - if v[0].decode('utf-8') == 'Host': - host = v[1].decode('utf-8') - path = self.request.path.decode('utf-8') - url = f'https://{host}{path}' + if v[0].decode("utf-8") == "Host": + host = v[1].decode("utf-8") + path = self.request.path.decode("utf-8") + url = f"https://{host}{path}" else: - host = self.request.host.decode('utf-8') - path = self.request.path.decode('utf-8') + host = self.request.host.decode("utf-8") + path = self.request.path.decode("utf-8") url = str(self.request._url) - logger.info(f'Ontology: {url}') + logger.info(f"Ontology: {url}") return url, host, path From 5f00b2771d694786beee4ec54768ff5dd119840d Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:28:08 +0200 Subject: [PATCH 25/35] use enum for config --- ontologytimemachine/utils/config.py | 74 ++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 23 deletions(-) diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py index 830511f..31e3cf1 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py @@ -11,35 +11,72 @@ class LogLevel(Enum): ERROR = "error" +class OntoFormat(Enum): + TURTLE = "turtle" + NTRIPLES = "ntriples" + RDFXML = "rdfxml" + HTMLDOCU = "htmldocu" + + +class OntoPrecedence(Enum): + DEFAULT = "default" + ENFORCED_PRIORITY = "enforcedPriority" + ALWAYS = "always" + + +class OntoVersion(Enum): + ORIGINAL = "original" + ORIGINAL_FAILOVER_LIVE_LATEST = "originalFailoverLiveLatest" + LATEST_ARCHIVED = "latestArchived" + TIMESTAMP_ARCHIVED = "timestampArchived" + DEPENDENCY_MANIFEST = "dependencyManifest" + + +class HttpsInterception(Enum): + NONE = "none" + ALL = "all" + BLOCK = "block" + ARCHIVO = "archivo" + + @dataclass class Config: logLevel: LogLevel = LogLevel.INFO ontoFormat: Dict[str, Any] = None ontoVersion: str = "" restrictedAccess: bool = False - httpsInterception: bool = False + httpsInterception: str = False disableRemovingRedirects: bool = False timestamp: str = "" # manifest: Dict[str, Any] = None +def enum_parser(enum_class, value): + value_lower = value.lower() + try: + return next(e.value for e in enum_class if e.value.lower() == value_lower) + except StopIteration: + valid_options = ", ".join([e.value for e in enum_class]) + raise ValueError( + f"Invalid value '{value}'. 
Available options are: {valid_options}" + ) + + def parse_arguments() -> Config: parser = argparse.ArgumentParser(description="Process ontology format and version.") # Defining ontoFormat argument with nested options parser.add_argument( "--ontoFormat", - type=str, - choices=["turtle", "ntriples", "rdfxml", "htmldocu"], - default="turtle", + type=lambda s: enum_parser(OntoFormat, s), + default=OntoFormat.TURTLE.value, help="Format of the ontology: turtle, ntriples, rdfxml, htmldocu", ) parser.add_argument( "--ontoPrecedence", - type=str, - choices=["default", "enforcedPriority", "always"], - default="enforcedPriority", + type=lambda s: enum_parser(OntoPrecedence, s), + default=OntoPrecedence.ENFORCED_PRIORITY.value, help="Precedence of the ontology: default, enforcedPriority, always", ) @@ -53,15 +90,8 @@ def parse_arguments() -> Config: # Defining ontoVersion argument parser.add_argument( "--ontoVersion", - type=str, - choices=[ - "original", - "originalFailoverLiveLatest", - "latestArchived", - "timestampArchived", - "dependencyManifest", - ], - default="originalFailoverLiveLatest", + type=lambda s: enum_parser(OntoVersion, s), + default=OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST.value, help="Version of the ontology: original, originalFailoverLive, originalFailoverArchivoMonitor, latestArchive, timestampArchive, dependencyManifest", ) @@ -76,9 +106,8 @@ def parse_arguments() -> Config: # Enable HTTPS interception for specific domains parser.add_argument( "--httpsInterception", - type=str, - choices=["none", "all", "block"], - default="all", + type=lambda s: enum_parser(HttpsInterception, s), + default=HttpsInterception.ALL.value, help="Enable HTTPS interception for specific domains: none, archivo, all, listfilename.", ) @@ -93,8 +122,8 @@ def parse_arguments() -> Config: # Log level parser.add_argument( "--logLevel", - type=str, - default="info", + type=lambda s: enum_parser(LogLevel, s), + default=LogLevel.INFO.value, help="Level of the logging: debug, info, warning, error.", ) @@ -134,8 +163,7 @@ def parse_arguments() -> Config: restrictedAccess=args.restrictedAccess, httpsInterception=args.httpsInterception, disableRemovingRedirects=args.disableRemovingRedirects, - timestamp=timestamp, - # manifest=manifest + timestamp=args.timestamp if hasattr(args, "timestamp") else "", ) return config From 2110f4db181c706d7279e9ae65de32b9d82265ca Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:31:02 +0200 Subject: [PATCH 26/35] use enum for config --- ontologytimemachine/utils/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py index 31e3cf1..6b95861 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py @@ -43,9 +43,9 @@ class HttpsInterception(Enum): class Config: logLevel: LogLevel = LogLevel.INFO ontoFormat: Dict[str, Any] = None - ontoVersion: str = "" + ontoVersion: OntoVersion = (OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST,) restrictedAccess: bool = False - httpsInterception: str = False + httpsInterception: HttpsInterception = (HttpsInterception.ARCHIVO,) disableRemovingRedirects: bool = False timestamp: str = "" # manifest: Dict[str, Any] = None From af082eb9a8e97180c04dcd4559696bab4cb83730 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:33:08 +0200 Subject: [PATCH 27/35] update proxy logic function def and add do_intercept hook --- ontologytimemachine/utils/proxy_logic.py | 112 
++++++++++++----------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 526db4c..c479547 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -1,17 +1,19 @@ import logging import requests -import rdflib -from urllib.parse import urlparse from ontologytimemachine.utils.utils import ( set_onto_format_headers, get_format_from_accept_header, ) -from ontologytimemachine.utils.utils import parse_accept_header_with_priority -from ontologytimemachine.utils.utils import archivo_api, passthrough_status_codes -from ontologytimemachine.utils.mock_responses import mock_response_500 +from ontologytimemachine.utils.download_archivo_urls import load_archivo_urls +from ontologytimemachine.utils.utils import ( + parse_accept_header_with_priority, + archivo_api, + passthrough_status_codes, +) from ontologytimemachine.utils.mock_responses import ( - mock_response_404, mock_response_403, + mock_response_404, + mock_response_500, ) from typing import Set, Tuple @@ -22,14 +24,10 @@ logger = logging.getLogger(__name__) -ARCHIVO_PARSED_URLS: Set[Tuple[str, str]] = set() - - -def if_intercept_host(https_intercept): - print(https_intercept) - if https_intercept in ["none", "all"]: +def if_intercept_host(config): + if config.httpsInterception in ["none", "all"]: return True - elif https_intercept in ["block"]: + elif config.httpsInterception in ["block"]: return False return False @@ -43,19 +41,6 @@ def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, only_ontologie return False -def load_archivo_urls(): - """Load the archivo URLs into the global variable if not already loaded.""" - global ARCHIVO_PARSED_URLS - print(ARCHIVO_PARSED_URLS) - if not ARCHIVO_PARSED_URLS: # Load only if the set is empty - logger.info("Loading archivo ontologies from file") - with open("ontologytimemachine/utils/archivo_ontologies.txt", "r") as file: - ARCHIVO_PARSED_URLS = { - (urlparse(line.strip()).netloc, urlparse(line.strip()).path) - for line in file - } - - def get_response_from_request(wrapped_request, config): do_deny = do_deny_request_due_non_archivo_ontology_uri( wrapped_request, config.restrictedAccess @@ -66,13 +51,7 @@ def get_response_from_request(wrapped_request, config): ) return mock_response_403 - response = proxy_logic( - wrapped_request, - config.ontoFormat, - config.ontoVersion, - config.disableRemovingRedirects, - config.timestamp, - ) + response = proxy_logic(wrapped_request, config) return response @@ -82,15 +61,43 @@ def is_archivo_ontology_request(wrapped_request): # Ensure the archivo URLs are loaded load_archivo_urls() + from ontologytimemachine.utils.download_archivo_urls import ARCHIVO_PARSED_URLS # Extract the request's host and path - request_host = wrapped_request.get_request().host.decode("utf-8") - request_path = wrapped_request.get_request().path.decode("utf-8") + request_host = wrapped_request.get_request_host() + request_path = wrapped_request.get_request_path() + + print(f"Host: {request_host}") + print(f"Path: {request_path}") + print((request_host, request_path)) + print(list(ARCHIVO_PARSED_URLS)[0]) + if (request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True - print((request_host, request_path) in ARCHIVO_PARSED_URLS) + # Remove last hash and check again + if request_path.endswith("/"): + request_path = request_path.rstrip("/") + if 
(request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True - # Check if the (host, path) tuple exists in ARCHIVO_PARSED_URLS - return (request_host, request_path) in ARCHIVO_PARSED_URLS + # Cut the last part of the path + + path_parts = request_path.split("/") + new_path = "/".join(path_parts[:-1]) + print(f"New path: {new_path}") + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True + + new_path = "/".join(path_parts[:-2]) + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True + + logger.info(f"Requested URL: {request_host+request_path} is NOT in Archivo") + return False def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): @@ -106,12 +113,11 @@ def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): return mock_response_404() -def proxy_logic( - wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp -): +# change the function definition and pass only the config +def proxy_logic(wrapped_request, config): logger.info("Proxy has to intervene") - set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) + set_onto_format_headers(wrapped_request, config) headers = wrapped_request.get_request_headers() ontology, _, _ = wrapped_request.get_request_url_host_path() @@ -119,22 +125,20 @@ def proxy_logic( # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through format = get_format_from_accept_header(headers) - if not format and ontoVersion != "original": + if not format and config.ontoVersion != "original": logger.info(f"No format can be used from Archivo") return mock_response_500 - if ontoVersion == "original": - response = fetch_original(ontology, headers, disableRemovingRedirects) - elif ontoVersion == "originalFailoverLiveLatest": + if config.ontoVersion == "original": + response = fetch_original(ontology, headers, config) + elif config.ontoVersion == "originalFailoverLiveLatest": response = fetch_failover( - wrapped_request, ontology, headers, disableRemovingRedirects + wrapped_request, ontology, headers, config.disableRemovingRedirects ) - elif ontoVersion == "latestArchived": + elif config.ontoVersion == "latestArchived": response = fetch_latest_archived(wrapped_request, ontology, headers) - elif ontoVersion == "timestampArchived": - response = fetch_timestamp_archived( - wrapped_request, ontology, headers, timestamp - ) + elif config.ontoVersion == "timestampArchived": + response = fetch_timestamp_archived(wrapped_request, ontology, headers, config) # Commenting the manifest related part because it is not supported in the current version # elif ontoVersion == 'dependencyManifest': # response = fetch_dependency_manifest(ontology, headers, manifest) @@ -188,7 +192,7 @@ def fetch_latest_archived(wrapped_request, ontology, headers): return request_ontology(dbpedia_url, headers) -def fetch_timestamp_archived(wrapped_request, ontology, headers, timestamp): +def fetch_timestamp_archived(wrapped_request, ontology, headers, config): if not is_archivo_ontology_request(wrapped_request): logger.info( "Data needs to be fetched from Archivo, but ontology is not available on Archivo." 
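The lookup above tries the exact (host, path) pair, then the path with a trailing slash stripped, then the path with one and with two trailing segments removed. A minimal standalone sketch of that order, assuming ARCHIVO_PARSED_URLS has already been loaded as a set of (netloc, path) tuples by load_archivo_urls(); the helper name and the example set below are illustrative only:

from typing import Set, Tuple

def matches_archivo(host: str, path: str, urls: Set[Tuple[str, str]]) -> bool:
    # Exact (netloc, path) match first.
    if (host, path) in urls:
        return True
    # Retry with a trailing slash stripped.
    if path.endswith("/"):
        path = path.rstrip("/")
        if (host, path) in urls:
            return True
    # Fall back to the parent path, then the grandparent path.
    parts = path.split("/")
    if (host, "/".join(parts[:-1])) in urls:
        return True
    return (host, "/".join(parts[:-2])) in urls

# e.g. a term IRI below a registered ontology root still matches:
assert matches_archivo("dbpedia.org", "/ontology/Person", {("dbpedia.org", "/ontology")})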
@@ -196,7 +200,7 @@ def fetch_timestamp_archived(wrapped_request, ontology, headers, timestamp): return mock_response_404() logger.info("Fetch archivo timestamp") format = get_format_from_accept_header(headers) - dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={timestamp}" + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={config.timestamp}" logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) From 8e232fdb2e5871c2998b2b073e663fd57214352c Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:33:08 +0200 Subject: [PATCH 28/35] update proxy logic function def and add do_intercept hook --- ontologytimemachine/custom_proxy.py | 30 +++++- ontologytimemachine/utils/proxy_logic.py | 112 ++++++++++++----------- 2 files changed, 83 insertions(+), 59 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 7e8ade5..368ec10 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -3,8 +3,11 @@ from proxy.common.utils import build_http_response from ontologytimemachine.utils.mock_responses import mock_response_403 from ontologytimemachine.proxy_wrapper import HttpRequestWrapper -from ontologytimemachine.utils.proxy_logic import get_response_from_request -from ontologytimemachine.utils.proxy_logic import if_intercept_host +from ontologytimemachine.utils.proxy_logic import ( + get_response_from_request, + if_intercept_host, + is_archivo_ontology_request, +) from ontologytimemachine.utils.config import Config, parse_arguments from http.client import responses import proxy @@ -29,7 +32,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.config = config - def before_upstream_connection(self, request: HttpParser): + def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: + print(config) logger.info("Before upstream connection hook") logger.info( f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" @@ -40,7 +44,7 @@ def before_upstream_connection(self, request: HttpParser): logger.info(f"HTTPS interception mode: {self.config.httpsInterception}") # Only intercept if interception is enabled - if if_intercept_host(self.config.httpsInterception): + if if_intercept_host(self.config): logger.info("HTTPS interception is on, forwardig the request") return request else: @@ -56,7 +60,23 @@ def before_upstream_connection(self, request: HttpParser): return request - def handle_client_request(self, request: HttpParser): + def do_intercept(self, _request: HttpParser) -> bool: + wrapped_request = HttpRequestWrapper(_request) + if self.config.httpsInterception in ["all", "none"]: + return True + elif self.config.httpsInterception in ["block"]: + return False + elif self.config.httpsInterception in ["archivo"]: + if is_archivo_ontology_request(wrapped_request): + return True + return False + else: + logger.info( + f"httpsInterception: {self.config.httpsInterception} option is not allowed." 
+ ) + return False + + def handle_client_request(self, request: HttpParser) -> HttpParser: logger.info("Handle client request hook") logger.info( f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}" diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 526db4c..c479547 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -1,17 +1,19 @@ import logging import requests -import rdflib -from urllib.parse import urlparse from ontologytimemachine.utils.utils import ( set_onto_format_headers, get_format_from_accept_header, ) -from ontologytimemachine.utils.utils import parse_accept_header_with_priority -from ontologytimemachine.utils.utils import archivo_api, passthrough_status_codes -from ontologytimemachine.utils.mock_responses import mock_response_500 +from ontologytimemachine.utils.download_archivo_urls import load_archivo_urls +from ontologytimemachine.utils.utils import ( + parse_accept_header_with_priority, + archivo_api, + passthrough_status_codes, +) from ontologytimemachine.utils.mock_responses import ( - mock_response_404, mock_response_403, + mock_response_404, + mock_response_500, ) from typing import Set, Tuple @@ -22,14 +24,10 @@ logger = logging.getLogger(__name__) -ARCHIVO_PARSED_URLS: Set[Tuple[str, str]] = set() - - -def if_intercept_host(https_intercept): - print(https_intercept) - if https_intercept in ["none", "all"]: +def if_intercept_host(config): + if config.httpsInterception in ["none", "all"]: return True - elif https_intercept in ["block"]: + elif config.httpsInterception in ["block"]: return False return False @@ -43,19 +41,6 @@ def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, only_ontologie return False -def load_archivo_urls(): - """Load the archivo URLs into the global variable if not already loaded.""" - global ARCHIVO_PARSED_URLS - print(ARCHIVO_PARSED_URLS) - if not ARCHIVO_PARSED_URLS: # Load only if the set is empty - logger.info("Loading archivo ontologies from file") - with open("ontologytimemachine/utils/archivo_ontologies.txt", "r") as file: - ARCHIVO_PARSED_URLS = { - (urlparse(line.strip()).netloc, urlparse(line.strip()).path) - for line in file - } - - def get_response_from_request(wrapped_request, config): do_deny = do_deny_request_due_non_archivo_ontology_uri( wrapped_request, config.restrictedAccess @@ -66,13 +51,7 @@ def get_response_from_request(wrapped_request, config): ) return mock_response_403 - response = proxy_logic( - wrapped_request, - config.ontoFormat, - config.ontoVersion, - config.disableRemovingRedirects, - config.timestamp, - ) + response = proxy_logic(wrapped_request, config) return response @@ -82,15 +61,43 @@ def is_archivo_ontology_request(wrapped_request): # Ensure the archivo URLs are loaded load_archivo_urls() + from ontologytimemachine.utils.download_archivo_urls import ARCHIVO_PARSED_URLS # Extract the request's host and path - request_host = wrapped_request.get_request().host.decode("utf-8") - request_path = wrapped_request.get_request().path.decode("utf-8") + request_host = wrapped_request.get_request_host() + request_path = wrapped_request.get_request_path() + + print(f"Host: {request_host}") + print(f"Path: {request_path}") + print((request_host, request_path)) + print(list(ARCHIVO_PARSED_URLS)[0]) + if (request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in 
Archivo") + return True - print((request_host, request_path) in ARCHIVO_PARSED_URLS) + # Remove last hash and check again + if request_path.endswith("/"): + request_path = request_path.rstrip("/") + if (request_host, request_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True - # Check if the (host, path) tuple exists in ARCHIVO_PARSED_URLS - return (request_host, request_path) in ARCHIVO_PARSED_URLS + # Cut the last part of the path + + path_parts = request_path.split("/") + new_path = "/".join(path_parts[:-1]) + print(f"New path: {new_path}") + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True + + new_path = "/".join(path_parts[:-2]) + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + return True + + logger.info(f"Requested URL: {request_host+request_path} is NOT in Archivo") + return False def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): @@ -106,12 +113,11 @@ def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): return mock_response_404() -def proxy_logic( - wrapped_request, ontoFormat, ontoVersion, disableRemovingRedirects, timestamp -): +# change the function definition and pass only the config +def proxy_logic(wrapped_request, config): logger.info("Proxy has to intervene") - set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion) + set_onto_format_headers(wrapped_request, config) headers = wrapped_request.get_request_headers() ontology, _, _ = wrapped_request.get_request_url_host_path() @@ -119,22 +125,20 @@ def proxy_logic( # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through format = get_format_from_accept_header(headers) - if not format and ontoVersion != "original": + if not format and config.ontoVersion != "original": logger.info(f"No format can be used from Archivo") return mock_response_500 - if ontoVersion == "original": - response = fetch_original(ontology, headers, disableRemovingRedirects) - elif ontoVersion == "originalFailoverLiveLatest": + if config.ontoVersion == "original": + response = fetch_original(ontology, headers, config) + elif config.ontoVersion == "originalFailoverLiveLatest": response = fetch_failover( - wrapped_request, ontology, headers, disableRemovingRedirects + wrapped_request, ontology, headers, config.disableRemovingRedirects ) - elif ontoVersion == "latestArchived": + elif config.ontoVersion == "latestArchived": response = fetch_latest_archived(wrapped_request, ontology, headers) - elif ontoVersion == "timestampArchived": - response = fetch_timestamp_archived( - wrapped_request, ontology, headers, timestamp - ) + elif config.ontoVersion == "timestampArchived": + response = fetch_timestamp_archived(wrapped_request, ontology, headers, config) # Commenting the manifest related part because it is not supported in the current version # elif ontoVersion == 'dependencyManifest': # response = fetch_dependency_manifest(ontology, headers, manifest) @@ -188,7 +192,7 @@ def fetch_latest_archived(wrapped_request, ontology, headers): return request_ontology(dbpedia_url, headers) -def fetch_timestamp_archived(wrapped_request, ontology, headers, timestamp): +def fetch_timestamp_archived(wrapped_request, ontology, headers, config): if not is_archivo_ontology_request(wrapped_request): 
logger.info( "Data needs to be fetched from Archivo, but ontology is not available on Archivo." @@ -196,7 +200,7 @@ def fetch_timestamp_archived(wrapped_request, ontology, headers, timestamp): return mock_response_404() logger.info("Fetch archivo timestamp") format = get_format_from_accept_header(headers) - dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={timestamp}" + dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={config.timestamp}" logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) From 3d8435d29228e46d4d77ce92d28178ab84b73750 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:34:57 +0200 Subject: [PATCH 29/35] fix wrapper for host and path --- ontologytimemachine/proxy_wrapper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index 785d604..6829154 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -31,11 +31,11 @@ def is_https_request(self) -> bool: pass @abstractmethod - def get_request_host(self) -> Any: + def get_request_host(self) -> str: pass @abstractmethod - def get_request_path(self) -> Any: + def get_request_path(self) -> str: pass @abstractmethod @@ -77,7 +77,7 @@ def get_request_host(self) -> str: return self.request.host.decode("utf-8") def get_request_path(self) -> str: - return self.request.host.decode("utf-8") + return self.request.path.decode("utf-8") def get_request_headers(self) -> Dict[str, str]: headers: Dict[str, str] = {} From 5a4c00a91cd3a60aa67cce9052e583aaf3d87bed Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:36:30 +0200 Subject: [PATCH 30/35] fix downlaod archivo --- .../utils/download_archivo_urls.py | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 ontologytimemachine/utils/download_archivo_urls.py diff --git a/ontologytimemachine/utils/download_archivo_urls.py b/ontologytimemachine/utils/download_archivo_urls.py new file mode 100644 index 0000000..ed3065c --- /dev/null +++ b/ontologytimemachine/utils/download_archivo_urls.py @@ -0,0 +1,139 @@ +import os +import hashlib +import logging +import requests +import schedule +import time +import csv +from datetime import datetime, timedelta +from urllib.parse import urlparse +from typing import Set, Tuple + + +ARCHIVO_PARSED_URLS: Set[Tuple[str, str]] = set() + + +ARCHIVO_FILE_PATH = "ontologytimemachine/utils/archivo_ontologies_download.txt" +ARCHIVO_URL = "https://databus.dbpedia.org/ontologies/archivo-indices/ontologies/2024.07.26-220000/ontologies_type=official.csv" +HASH_FILE_PATH = "ontologytimemachine/utils/archivo_ontologies_hash.txt" + + +LAST_DOWNLOAD_TIMESTAMP = None +DOWNLOAD_INTERVAL = timedelta(days=1) # 1 day interval for checking the download + + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + + +def schedule_daily_download(): + """Schedule the download to run at 3 AM every day.""" + schedule.every().day.at("03:00").do(download_archivo_urls) + + while True: + schedule.run_pending() + time.sleep(60) # Check every minute if there’s a scheduled task + + +# Start the scheduler in the background +def start_scheduler(): + logger.info("Starting the scheduler for daily archivo ontology download.") + schedule_daily_download() + + +# Function to calculate hash of the downloaded file +def 
calculate_file_hash(file_path): + sha256_hash = hashlib.sha256() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + sha256_hash.update(byte_block) + return sha256_hash.hexdigest() + + +# Function to download and update archivo URLs file +def download_archivo_urls(): + """Download the archivo ontologies file, extract the first column, and save to a text file if a new version is available.""" + try: + logger.info("Checking for new version of archivo ontologies") + + # Download the latest archivo ontologies CSV + response = requests.get(ARCHIVO_URL) + response.raise_for_status() # Ensure the request was successful + + # Save the file temporarily to calculate the hash + temp_file_path = "temp_ontology_indices.csv" + with open(temp_file_path, "wb") as temp_file: + temp_file.write(response.content) + + # Calculate the hash of the new file + new_file_hash = calculate_file_hash(temp_file_path) + + # Compare with the existing hash if available + if os.path.exists(HASH_FILE_PATH): + with open(HASH_FILE_PATH, "r") as hash_file: + old_file_hash = hash_file.read().strip() + else: + old_file_hash = None + + if new_file_hash != old_file_hash: + # New version detected, extract the first column and save to the text file + with open(temp_file_path, "r", newline="", encoding="utf-8") as csv_file: + csv_reader = csv.reader(csv_file, delimiter=",") + with open(ARCHIVO_FILE_PATH, "w") as txt_file: + for row in csv_reader: + if row: # Ensure row is not empty + print(row) + txt_file.write( + row[0].strip() + "\n" + ) # Write only the first column (URL) to the text file + + # Save the new hash + with open(HASH_FILE_PATH, "w") as hash_file: + hash_file.write(new_file_hash) + + logger.info("New version of archivo ontologies downloaded and saved.") + else: + # No new version, remove the temporary file + os.remove(temp_file_path) + logger.info("No new version of archivo ontologies detected.") + + # Update the last download timestamp + global LAST_DOWNLOAD_TIMESTAMP + LAST_DOWNLOAD_TIMESTAMP = datetime.now() + + except requests.RequestException as e: + logger.error(f"Failed to download archivo ontologies: {e}") + + +def load_archivo_urls(): + """Load the archivo URLs into the global variable if not already loaded or if a day has passed since the last download.""" + global ARCHIVO_PARSED_URLS + global LAST_DOWNLOAD_TIMESTAMP + + # Check if ARCHIVO_PARSED_URLS is empty or the last download was over a day ago + if not ARCHIVO_PARSED_URLS or ( + LAST_DOWNLOAD_TIMESTAMP is None + or datetime.now() - LAST_DOWNLOAD_TIMESTAMP > DOWNLOAD_INTERVAL + ): + logger.info( + "ARCHIVO_PARSED_URLS is empty or more than a day has passed since the last download." 
+ ) + download_archivo_urls() + + # Load archivo URLs after downloading or if already present + if not ARCHIVO_PARSED_URLS: # Load only if the set is empty + logger.info("Loading archivo ontologies from file") + try: + with open(ARCHIVO_FILE_PATH, "r") as file: + ARCHIVO_PARSED_URLS = { + (urlparse(line.strip()).netloc, urlparse(line.strip()).path) + for line in file + } + logger.info(f"Loaded {len(ARCHIVO_PARSED_URLS)} ontology URLs.") + + except FileNotFoundError: + logger.error("Archivo ontology file not found.") + except Exception as e: + logger.error(f"Error loading archivo ontology URLs: {e}") From 93fb8bed552acf767053a8990939513b8f84e7f1 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 11:39:16 +0200 Subject: [PATCH 31/35] fix tests --- ontologytimemachine/utils/utils.py | 116 +++++++++++++++--------- poetry.lock | 16 +++- pyproject.toml | 1 + tests/test_integration.py | 138 ++++++++++++++++------------- tests/test_proxy_logic.py | 19 ++-- 5 files changed, 176 insertions(+), 114 deletions(-) diff --git a/ontologytimemachine/utils/utils.py b/ontologytimemachine/utils/utils.py index fb26e65..36075c7 100644 --- a/ontologytimemachine/utils/utils.py +++ b/ontologytimemachine/utils/utils.py @@ -3,69 +3,101 @@ from werkzeug.http import parse_accept_header -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) -archivo_api = 'https://archivo.dbpedia.org/download' -archivo_mimetypes = ['application/rdf+xml', 'application/owl+xml', 'text/turtle', 'application/n-triples'] +archivo_api = "https://archivo.dbpedia.org/download" +archivo_mimetypes = [ + "application/rdf+xml", + "application/owl+xml", + "text/turtle", + "application/n-triples", +] passthrough_status_codes = [ - 100, 101, 102, 103, + 100, + 101, + 102, + 103, 200, - 300, 301, 302, 303, 304, 307, 308, + 300, + 301, + 302, + 303, + 304, + 307, + 308, 451, ] -def get_mime_type(format='turtle'): +def get_mime_type(format="turtle"): # Define a mapping of formats to MIME types format_to_mime = { - 'turtle': 'text/turtle', - 'ntriples': 'application/n-triples', - 'rdfxml': 'application/rdf+xml', - 'htmldocu': 'text/html' + "turtle": "text/turtle", + "ntriples": "application/n-triples", + "rdfxml": "application/rdf+xml", + "htmldocu": "text/html", } - + # Return the MIME type based on the format or use a generic default - return format_to_mime.get(format, 'text/turtle') + return format_to_mime.get(format, "text/turtle") def map_mime_to_format(mime_type): # Map file extensions to formats mime_to_format = { - 'application/rdf+xml': 'owl', # Common MIME type for OWL files - 'application/owl+xml': 'owl', # Specific MIME type for OWL - 'text/turtle': 'ttl', # MIME type for Turtle format - 'application/n-triples': 'nt', # MIME type for N-Triples format + "application/rdf+xml": "owl", # Common MIME type for OWL files + "application/owl+xml": "owl", # Specific MIME type for OWL + "text/turtle": "ttl", # MIME type for Turtle format + "application/n-triples": "nt", # MIME type for N-Triples format } - + return mime_to_format.get(mime_type, None) -def set_onto_format_headers(wrapped_request, ontoFormat, ontoVersion): - logger.info(f'Setting headers based on ontoFormat: {ontoFormat} and ontoVersion: {ontoVersion}') +def set_onto_format_headers(wrapped_request, config): + logger.info( + f"Setting headers based on ontoFormat: {config.ontoFormat} and 
ontoVersion: {config.ontoVersion}" + ) # if ontoVersion is original and patchAcceptUpstream is False nothing to do here - if ontoVersion == 'original' and not ontoFormat['patchAcceptUpstream']: + if ( + config.ontoVersion == "original" + and not config.ontoFormat["patchAcceptUpstream"] + ): return - + # Determine the correct MIME type for the format - mime_type = get_mime_type(ontoFormat['format']) - logger.info(f'Requested mimetype by proxy: {mime_type}') + mime_type = get_mime_type(config.ontoFormat["format"]) + logger.info(f"Requested mimetype by proxy: {mime_type}") # Define conditions for modifying the accept header request_accept_header = wrapped_request.get_request_accept_header() - logger.info(f'Accept header by request: {request_accept_header}') + logger.info(f"Accept header by request: {request_accept_header}") req_headers_with_priority = parse_accept_header_with_priority(request_accept_header) req_headers = [x[0] for x in req_headers_with_priority] - if not req_headers and ontoFormat['precedence'] in ['default', ['enforcedPriority']]: + if not req_headers and config.ontoFormat["precedence"] in [ + "default", + ["enforcedPriority"], + ]: wrapped_request.set_request_accept_header(mime_type) - elif len(req_headers) == 1 and req_headers[0] == '*/*' and ontoFormat['precedence'] in ['default', 'enforcedPriority']: + elif ( + len(req_headers) == 1 + and req_headers[0] == "*/*" + and config.ontoFormat["precedence"] in ["default", "enforcedPriority"] + ): wrapped_request.set_request_accept_header(mime_type) - elif len(req_headers) > 1 and mime_type in req_headers and ontoFormat['precedence'] == 'enforcedPriority': + elif ( + len(req_headers) > 1 + and mime_type in req_headers + and config.ontoFormat["precedence"] == "enforcedPriority" + ): wrapped_request.set_request_accept_header(mime_type) - elif ontoFormat['precedence'] == 'always': + elif config.ontoFormat["precedence"] == "always": wrapped_request.set_request_accept_header(mime_type) @@ -77,7 +109,9 @@ def select_highest_priority_mime_from_archivo(mime_list): highest_priority = sorted_mime_list[0][1] # Filter MIME types that match the highest priority - highest_priority_mimes = [mime for mime, priority in sorted_mime_list if priority == highest_priority] + highest_priority_mimes = [ + mime for mime, priority in sorted_mime_list if priority == highest_priority + ] # Check if any of the highest priority MIME types are in the archivo list for mime in highest_priority_mimes: @@ -89,13 +123,13 @@ def select_highest_priority_mime_from_archivo(mime_list): def parse_accept_header_with_priority(accept_header): - logger.info('Parse accept header') + logger.info("Parse accept header") # Parse the Accept header to extract MIME types and their priority (q values) parsed = parse_accept_header(accept_header) - + # Create a list of tuples with MIME types and their corresponding q values mime_types_with_priority = [(item[0], item[1]) for item in parsed] - logger.info(f'Accept headers with priority: {mime_types_with_priority}') + logger.info(f"Accept headers with priority: {mime_types_with_priority}") return mime_types_with_priority @@ -105,18 +139,20 @@ def get_format_from_accept_header(headers): return None # Map MIME types to formats - accept_header = headers.get('Accept', None) - logger.info(f'Accept header: {accept_header}') + accept_header = headers.get("Accept", None) + logger.info(f"Accept header: {accept_header}") if not accept_header: return None - + accept_header_with_priority = parse_accept_header_with_priority(accept_header) - - 
selected_mimetype = select_highest_priority_mime_from_archivo(accept_header_with_priority) + + selected_mimetype = select_highest_priority_mime_from_archivo( + accept_header_with_priority + ) if not selected_mimetype: - logger.info(f'The requested mimetype is not supported by DBpedia Archivo') + logger.info(f"The requested mimetype is not supported by DBpedia Archivo") return None - + format = map_mime_to_format(selected_mimetype) - return format \ No newline at end of file + return format diff --git a/poetry.lock b/poetry.lock index e0689c0..1e34442 100644 --- a/poetry.lock +++ b/poetry.lock @@ -364,6 +364,20 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] +[[package]] +name = "schedule" +version = "1.2.2" +description = "Job scheduling for humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "schedule-1.2.2-py3-none-any.whl", hash = "sha256:5bef4a2a0183abf44046ae0d164cadcac21b1db011bdd8102e4a0c1e91e06a7d"}, + {file = "schedule-1.2.2.tar.gz", hash = "sha256:15fe9c75fe5fd9b9627f3f19cc0ef1420508f9f9a46f45cd0769ef75ede5f0b7"}, +] + +[package.extras] +timezone = ["pytz"] + [[package]] name = "six" version = "1.16.0" @@ -423,4 +437,4 @@ watchdog = ["watchdog (>=2.3)"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "aa072190e1a5c335c379c9f3ab09b14dfcf718050b38b08441ba2a91ffefd935" +content-hash = "9efdbca22e8f7d122208d160253c194f4f3d177e77a011491bbaac34fac5c237" diff --git a/pyproject.toml b/pyproject.toml index 0232beb..ebce3c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ requests = "^2.32.3" proxy-py = "^2.4.4" rdflib = "^7.0.0" werkzeug = "^3.0.4" +schedule = "^1.2.2" [build-system] diff --git a/tests/test_integration.py b/tests/test_integration.py index e7a2ef6..472fa3d 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -6,13 +6,10 @@ from ontologytimemachine.custom_proxy import IP, PORT -PROXY = f'{IP}:{PORT}' -HTTP_PROXY = f'http://{PROXY}' -HTTPS_PROXY = f'http://{PROXY}' -PROXIES = { - "http": HTTP_PROXY, - "https": HTTPS_PROXY -} +PROXY = f"{IP}:{PORT}" +HTTP_PROXY = f"http://{PROXY}" +HTTPS_PROXY = f"http://{PROXY}" +PROXIES = {"http": HTTP_PROXY, "https": HTTPS_PROXY} CA_CERT_PATH = "ca-cert.pem" @@ -21,15 +18,15 @@ # # Start the proxy server in a subprocess # process = subprocess.Popen( # [ -# 'python3', 'ontologytimemachine/custom_proxy.py', +# 'python3', 'ontologytimemachine/custom_proxy.py', # ], # stdout=subprocess.PIPE, # stderr=subprocess.PIPE # ) - + # # Wait a bit to ensure the server starts # time.sleep(5) - + # yield # "http://0.0.0.0:8899" # # Terminate the proxy server after tests @@ -38,90 +35,91 @@ def test_12_data_globalchange(): - iri = 'http://data.globalchange.gov/gcis.owl' - generic_test(iri, 'text/turtle') + iri = "http://data.globalchange.gov/gcis.owl" + generic_test(iri, "text/turtle") def test_13_data_ontotext(): - iri = 'http://data.ontotext.com/resource/leak/' - generic_test(iri, 'text/turtle') + iri = "http://data.ontotext.com/resource/leak/" + generic_test(iri, "text/turtle") def test_1_babelnet(): - iri = 'http://babelnet.org/rdf/' - generic_test(iri, 'text/turtle') + iri = "http://babelnet.org/rdf/" + generic_test(iri, "text/turtle") + def test_2_bag_basisregistraties(): - iri = 'http://bag.basisregistraties.overheid.nl/def/bag' - generic_test(iri, 'text/turtle') + iri = "http://bag.basisregistraties.overheid.nl/def/bag" + generic_test(iri, "text/turtle") def test_3_bblfish(): - iri = 
'http://bblfish.net/work/atom-owl/2006-06-06/' - generic_test(iri, 'text/turtle') + iri = "http://bblfish.net/work/atom-owl/2006-06-06/" + generic_test(iri, "text/turtle") def test_4_brk_basisregistraties(): - iri = 'http://brk.basisregistraties.overheid.nl/def/brk' - generic_test(iri, 'text/turtle') + iri = "http://brk.basisregistraties.overheid.nl/def/brk" + generic_test(iri, "text/turtle") def test_5_brt_basisregistraties(): - iri = 'http://brt.basisregistraties.overheid.nl/def/top10nl' - generic_test(iri, 'text/turtle') + iri = "http://brt.basisregistraties.overheid.nl/def/top10nl" + generic_test(iri, "text/turtle") def test_6_brt_basisregistraties_begrippenkader(): - iri = 'http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl' - generic_test(iri, 'text/turtle') + iri = "http://brt.basisregistraties.overheid.nl/id/begrippenkader/top10nl" + generic_test(iri, "text/turtle") def test_7_buzzword(): - iri = 'http://buzzword.org.uk/rdf/personal-link-types#' - generic_test(iri, 'text/turtle') + iri = "http://buzzword.org.uk/rdf/personal-link-types#" + generic_test(iri, "text/turtle") def test_8_catalogus_professorum(): - iri = 'http://catalogus-professorum.org/cpm/2/' - generic_test(iri, 'text/turtle') + iri = "http://catalogus-professorum.org/cpm/2/" + generic_test(iri, "text/turtle") def test_9_data_gov(): - iri = 'http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf' - generic_test(iri, 'text/turtle') + iri = "http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf" + generic_test(iri, "text/turtle") def test_10_data_bigdatagrapes(): - iri = 'http://data.bigdatagrapes.eu/resource/ontology/' - generic_test(iri, 'text/turtle') + iri = "http://data.bigdatagrapes.eu/resource/ontology/" + generic_test(iri, "text/turtle") def test_11_data_europa_esco(): - iri = 'http://data.europa.eu/esco/flow' - generic_test(iri, 'text/turtle') + iri = "http://data.europa.eu/esco/flow" + generic_test(iri, "text/turtle") def test_14_data_ordnancesurvey_50kGazetteer(): - iri = 'http://dbpedia.org/ontology/Person' - generic_test(iri, 'text/turtle') + iri = "http://dbpedia.org/ontology/Person" + generic_test(iri, "text/turtle") def test_15_linked_web_apis(): - iri = 'http://linked-web-apis.fit.cvut.cz/ns/core' - generic_test(iri, 'text/turtle') + iri = "http://linked-web-apis.fit.cvut.cz/ns/core" + generic_test(iri, "text/turtle") def generic_test(iri, content_type): response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 - assert iri in response.content.decode('utf-8') + assert iri in response.content.decode("utf-8") def iri_generic_test(iri): try: response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH) assert response.status_code == 200 - assert iri in response.content.decode('utf-8') + assert iri in response.content.decode("utf-8") print(f"Test passed for IRI: {iri}") except AssertionError: print(f"Test failed for IRI: {iri}") @@ -130,24 +128,40 @@ def iri_generic_test(iri): def get_parameter_combinations(): -# Define the possible values for each parameter - ontoFormat = ['turtle', 'ntriples', 'rdfxml', 'htmldocu'] - ontoPrecedence = ['default', 'enforcedPriority', 'always'] - patchAcceptUpstream = [True, False] - ontoVersion = ['original', 'originalFailoverLive', 'originalFailoverArchivoMonitor', - 'latestArchive', 'timestampArchive', 'dependencyManifest'] - onlyOntologies = [True, False] - httpsIntercept = [True, False] - inspectRedirects = [True, False] - forwardHeaders = [True, False] - subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] 
- - combinations = list(itertools.product(ontoFormat, ontoPrecedence, patchAcceptUpstream, ontoVersion, - onlyOntologies, httpsIntercept, inspectRedirects, - forwardHeaders, subjectBinarySearchThreshold)) - return combinations - - -if __name__ == '__main__': - + # Define the possible values for each parameter + ontoFormat = ["turtle", "ntriples", "rdfxml", "htmldocu"] + ontoPrecedence = ["default", "enforcedPriority", "always"] + patchAcceptUpstream = [True, False] + ontoVersion = [ + "original", + "originalFailoverLive", + "originalFailoverArchivoMonitor", + "latestArchive", + "timestampArchive", + "dependencyManifest", + ] + onlyOntologies = [True, False] + httpsIntercept = [True, False] + inspectRedirects = [True, False] + forwardHeaders = [True, False] + subjectBinarySearchThreshold = [1, 2, 3, 4, 5, 10, 25, 50, 100] + + combinations = list( + itertools.product( + ontoFormat, + ontoPrecedence, + patchAcceptUpstream, + ontoVersion, + onlyOntologies, + httpsIntercept, + inspectRedirects, + forwardHeaders, + subjectBinarySearchThreshold, + ) + ) + return combinations + + +if __name__ == "__main__": + pytest.main() diff --git a/tests/test_proxy_logic.py b/tests/test_proxy_logic.py index daf3701..07bc93e 100644 --- a/tests/test_proxy_logic.py +++ b/tests/test_proxy_logic.py @@ -11,22 +11,19 @@ class TestProxyLogic(unittest.TestCase): - def test_if_intercept_host(self): - self.assertTrue(if_intercept_host("all")) - self.assertFalse(if_intercept_host("block")) - self.assertTrue(if_intercept_host("none")) - def test_do_deny_request_due_non_archivo_ontology_uri(self): # Assuming we are using some sample data structure class WrappedRequest: - def __init__(self, host, path): - self.host = host - self.path = path + def __init__(self, request): + self.request = {"host": request[0], "path": request[1]} + + def get_request_host(self) -> str: + return self.request["host"].decode("utf-8") - def get_request(self): - return self + def get_request_path(self) -> str: + return self.request["path"].decode("utf-8") - request = WrappedRequest(b"example.com", b"/ontology") + request = WrappedRequest((b"example.com", b"/ontology")) self.assertTrue(do_deny_request_due_non_archivo_ontology_uri(request, True)) self.assertFalse(do_deny_request_due_non_archivo_ontology_uri(request, False)) From 28c5b0878cad69fc0cbf5242729b51673ea018db Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Tue, 15 Oct 2024 13:24:40 +0200 Subject: [PATCH 32/35] fix do_intercept --- ontologytimemachine/custom_proxy.py | 4 ++-- ontologytimemachine/utils/config.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 368ec10..9d07327 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -62,9 +62,9 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: def do_intercept(self, _request: HttpParser) -> bool: wrapped_request = HttpRequestWrapper(_request) - if self.config.httpsInterception in ["all", "none"]: + if self.config.httpsInterception in ["all"]: return True - elif self.config.httpsInterception in ["block"]: + elif self.config.httpsInterception in ["none"]: return False elif self.config.httpsInterception in ["archivo"]: if is_archivo_ontology_request(wrapped_request): diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py index 6b95861..48093d1 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py 
@@ -45,7 +45,7 @@ class Config:
     ontoFormat: Dict[str, Any] = None
     ontoVersion: OntoVersion = (OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST,)
     restrictedAccess: bool = False
-    httpsInterception: HttpsInterception = (HttpsInterception.ARCHIVO,)
+    httpsInterception: HttpsInterception = (HttpsInterception.ALL,)
     disableRemovingRedirects: bool = False
     timestamp: str = ""
     # manifest: Dict[str, Any] = None

From 7d4d3a9b83d67f555339ccb7209a7684dd3eecb9 Mon Sep 17 00:00:00 2001
From: Jenifer Tabita Ciuciu-Kiss
Date: Wed, 16 Oct 2024 23:38:37 +0200
Subject: [PATCH 33/35] move dependency.ttl to tests

---
 {ontologytimemachine/utils => tests}/dependency.ttl | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {ontologytimemachine/utils => tests}/dependency.ttl (100%)

diff --git a/ontologytimemachine/utils/dependency.ttl b/tests/dependency.ttl
similarity index 100%
rename from ontologytimemachine/utils/dependency.ttl
rename to tests/dependency.ttl

From de9b6cd76adc5a3806e0823b119ce7a935e5e2c7 Mon Sep 17 00:00:00 2001
From: Jenifer Tabita Ciuciu-Kiss
Date: Wed, 16 Oct 2024 23:38:57 +0200
Subject: [PATCH 34/35] remove prints

---
 ontologytimemachine/utils/download_archivo_urls.py | 3 +--
 ontologytimemachine/utils/proxy_logic.py | 9 ++-------
 tests/test_integration.py | 5 ++---
 tests/test_utils.py | 2 --
 4 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/ontologytimemachine/utils/download_archivo_urls.py b/ontologytimemachine/utils/download_archivo_urls.py
index ed3065c..030fff5 100644
--- a/ontologytimemachine/utils/download_archivo_urls.py
+++ b/ontologytimemachine/utils/download_archivo_urls.py
@@ -83,8 +83,7 @@ def download_archivo_urls():
         csv_reader = csv.reader(csv_file, delimiter=",")
         with open(ARCHIVO_FILE_PATH, "w") as txt_file:
             for row in csv_reader:
-                if row:  # Ensure row is not empty
-                    print(row)
+                if row:
                     txt_file.write(
                         row[0].strip() + "\n"
                     )  # Write only the first column (URL) to the text file
diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py
index c479547..3fc3eb3 100644
--- a/ontologytimemachine/utils/proxy_logic.py
+++ b/ontologytimemachine/utils/proxy_logic.py
@@ -34,7 +34,6 @@ def if_intercept_host(config):
 
 def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, only_ontologies):
     if only_ontologies:
-        print(only_ontologies)
         is_archivo_ontology = is_archivo_ontology_request(wrapped_request)
         if not is_archivo_ontology:
             return True
@@ -66,11 +65,7 @@ def is_archivo_ontology_request(wrapped_request):
     # Extract the request's host and path
     request_host = wrapped_request.get_request_host()
     request_path = wrapped_request.get_request_path()
-
-    print(f"Host: {request_host}")
-    print(f"Path: {request_path}")
-    print((request_host, request_path))
-    print(list(ARCHIVO_PARSED_URLS)[0])
+
     if (request_host, request_path) in ARCHIVO_PARSED_URLS:
         logger.info(f"Requested URL: {request_host+request_path} is in Archivo")
         return True
@@ -86,7 +81,7 @@ def is_archivo_ontology_request(wrapped_request):
 
         path_parts = request_path.split("/")
         new_path = "/".join(path_parts[:-1])
-        print(f"New path: {new_path}")
+
         if (request_host, new_path) in ARCHIVO_PARSED_URLS:
             logger.info(f"Requested URL: {request_host+request_path} is in Archivo")
             return True
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 472fa3d..f33e088 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -120,11 +120,10 @@ def iri_generic_test(iri):
         response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH)
         assert
response.status_code == 200 assert iri in response.content.decode("utf-8") - print(f"Test passed for IRI: {iri}") except AssertionError: - print(f"Test failed for IRI: {iri}") + return e except requests.exceptions.RequestException as e: - print(f"Request failed for IRI: {iri}, Error: {e}") + return e def get_parameter_combinations(): diff --git a/tests/test_utils.py b/tests/test_utils.py index 60cc702..fb0ca0a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -56,8 +56,6 @@ def test_parse_accept_header_with_priority(self): ("application/xml", 0.9), ("*/*", 0.8), ] - print(parsed_result) - print(expected_result) self.assertEqual(parsed_result, expected_result) def test_get_format_from_accept_header(self): From a1c47cf2cda135cb9ab4cc0ebb4c0f95b9ed95b7 Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Wed, 16 Oct 2024 23:55:18 +0200 Subject: [PATCH 35/35] rename block function --- ontologytimemachine/custom_proxy.py | 4 ++-- ontologytimemachine/utils/proxy_logic.py | 2 +- tests/test_proxy_logic.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index 9d07327..f147b60 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -5,7 +5,7 @@ from ontologytimemachine.proxy_wrapper import HttpRequestWrapper from ontologytimemachine.utils.proxy_logic import ( get_response_from_request, - if_intercept_host, + if_not_block_host, is_archivo_ontology_request, ) from ontologytimemachine.utils.config import Config, parse_arguments @@ -44,7 +44,7 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: logger.info(f"HTTPS interception mode: {self.config.httpsInterception}") # Only intercept if interception is enabled - if if_intercept_host(self.config): + if if_not_block_host(self.config): logger.info("HTTPS interception is on, forwardig the request") return request else: diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 3fc3eb3..77655d5 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) -def if_intercept_host(config): +def if_not_block_host(config): if config.httpsInterception in ["none", "all"]: return True elif config.httpsInterception in ["block"]: diff --git a/tests/test_proxy_logic.py b/tests/test_proxy_logic.py index 07bc93e..473e4bd 100644 --- a/tests/test_proxy_logic.py +++ b/tests/test_proxy_logic.py @@ -1,6 +1,6 @@ import unittest from ontologytimemachine.utils.proxy_logic import ( - if_intercept_host, + if_not_block_host, do_deny_request_due_non_archivo_ontology_uri, load_archivo_urls, is_archivo_ontology_request,