From 1e0d72629a720416382e5a9359812713de9d766f Mon Sep 17 00:00:00 2001
From: Philip Durbin
Date: Wed, 11 Dec 2024 21:22:09 -0500
Subject: [PATCH] add debugging

---
 repo2docker/app.py                        | 13 +++++++++++++
 repo2docker/contentproviders/dataverse.py | 13 +++++++++++++
 repo2docker/contentproviders/doi.py       |  2 ++
 repo2docker/utils.py                      |  5 +++++
 4 files changed, 33 insertions(+)

diff --git a/repo2docker/app.py b/repo2docker/app.py
index 71171405d..15f672a9a 100755
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@@ -491,6 +491,18 @@ def fetch(self, url, ref, checkout_path):
         picked_content_provider = None
         for ContentProvider in self.content_providers:
             cp = ContentProvider()
+            self.log.info(f"DEBUG: content provider is {cp.__class__.__name__}...")
+            if "Local" in {cp.__class__.__name__}:
+                print("skipping")
+                continue
+            if "Zenodo" in {cp.__class__.__name__}:
+                print("skipping")
+                continue
+            if "Figshare" in {cp.__class__.__name__}:
+                print("skipping")
+                continue
+            print("")
+            self.log.info(f"DEBUG: About to detect. URL is {url}. Trying content provider {cp.__class__.__name__}\n")
             spec = cp.detect(url, ref=ref)
             if spec is not None:
                 picked_content_provider = cp
@@ -504,6 +516,7 @@ def fetch(self, url, ref, checkout_path):
         if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
             picked_content_provider.set_auth_token(swh_token)

+        print("about to fetch...")
         for log_line in picked_content_provider.fetch(
             spec, checkout_path, yield_output=self.json_logs
         ):
diff --git a/repo2docker/contentproviders/dataverse.py b/repo2docker/contentproviders/dataverse.py
index 9054f53ce..9b222659c 100644
--- a/repo2docker/contentproviders/dataverse.py
+++ b/repo2docker/contentproviders/dataverse.py
@@ -37,9 +37,13 @@ def detect(self, doi, ref=None, extra_args=None):
         - doi:10.7910/DVN/6ZXAGT/3YRRYJ
         """
+        self.log.debug(f"DEBUG In Dataverse detect(). DOI is {doi}\n")
         url = self.doi2url(doi)
         # Parse the url, to get the base for later API calls
         parsed_url = urlparse(url)
+        self.log.info(f"DEBUG: ")
+        self.log.info(f"DEBUG: parsed_url: {parsed_url}")
+        # exit(1)

         # Check if the url matches any known Dataverse installation, bail if not.
         host = next(
@@ -53,9 +57,11 @@ def detect(self, doi, ref=None, extra_args=None):
         if host is None:
             return
+        # exit(1)

         query_args = parse_qs(parsed_url.query)
         # Corner case handling
         if parsed_url.path.startswith("/file.xhtml"):
+            print("got here 1")
             # There's no way of getting file information using its persistentId, the only thing we can do is assume that doi
             # is structured as "doi:<dataset_doi>/<file_doi>" and try to handle dataset that way.
             new_doi = doi.rsplit("/", 1)[0]
@@ -64,6 +70,7 @@ def detect(self, doi, ref=None, extra_args=None):
                 return
             return self.detect(new_doi)
         elif parsed_url.path.startswith("/api/access/datafile"):
+            print("got here 2")
             # Raw url pointing to a datafile is a typical output from an External Tool integration
             entity_id = os.path.basename(parsed_url.path)
             search_query = "q=entityId:" + entity_id + "&type=file"
@@ -84,7 +91,12 @@ def detect(self, doi, ref=None, extra_args=None):
             parsed_url.path.startswith("/dataset.xhtml")
             and "persistentId" in query_args
         ):
+            print()
+            print("got here 3, for example from this: jupyter-repo2docker doi:10.7910/DVN/TJCLKP")
             self.record_id = deep_get(query_args, "persistentId.0")
+            print("self.record_id BEGIN")
+            print(self.record_id)
+            print("self.record_id END")

         if hasattr(self, "record_id"):
             return {"record": self.record_id, "host": host}
@@ -96,6 +108,7 @@ def fetch(self, spec, output_dir, yield_output=False):

         yield f"Fetching Dataverse record {record_id}.\n"
         url = f'{host["url"]}/api/datasets/:persistentId?persistentId={record_id}'
+        print(f'Downloading from {url}')

         resp = self.urlopen(url, headers={"accept": "application/json"})
         record = resp.json()["data"]
diff --git a/repo2docker/contentproviders/doi.py b/repo2docker/contentproviders/doi.py
index 065602b7a..2392615ac 100644
--- a/repo2docker/contentproviders/doi.py
+++ b/repo2docker/contentproviders/doi.py
@@ -49,6 +49,7 @@ def doi2url(self, doi):
             doi = normalize_doi(doi)

             try:
+                self.log.info(f"DEBUG: About to request DOI: {doi}\n")
                 resp = self._request(f"https://doi.org/{doi}")
                 resp.raise_for_status()
             except HTTPError as e:
@@ -60,6 +61,7 @@ def doi2url(self, doi):
                 # default Git provider as this leads to a misleading error.
                 logging.error(f"DOI {doi} does not resolve: {e}")
                 raise
+            self.log.info(f"DEBUG: returning URL {resp.url}\n")
             return resp.url
         else:
             # Just return what is actulally just a URL
diff --git a/repo2docker/utils.py b/repo2docker/utils.py
index 9c2769e1d..46472eb0f 100644
--- a/repo2docker/utils.py
+++ b/repo2docker/utils.py
@@ -443,6 +443,10 @@ def deep_get(dikt, path):
       deep_get(data, "data.files.0") == data["data"]["files"][0]
     """
+    print("dikt BEGIN")
+    print(dikt)
+    print("dikt END")
+    print("path: " + path)
     value = dikt
     for component in path.split("."):
         if component.isdigit():
@@ -471,6 +475,7 @@ def normalize_doi(val):
     """Return just the DOI (e.g. 10.1234/jshd123) from a
     val that could include a url or doi
     (e.g. https://doi.org/10.1234/jshd123)"""
+    print("calling normalize_doi()...")
     m = doi_regexp.match(val)
     return m.group(2)
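
For reference, a minimal standalone sketch of what the dikt/path debug prints in utils.py and the
"self.record_id BEGIN/END" prints in dataverse.py should show once detect() reaches the "got here 3"
branch for doi:10.7910/DVN/TJCLKP. The resolved dataset URL and the local deep_get() copy below are
illustrative assumptions, not repo2docker itself:

    from urllib.parse import parse_qs, urlparse

    # Assumed resolved form of doi:10.7910/DVN/TJCLKP on Harvard Dataverse.
    url = "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/TJCLKP"

    # parse_qs() maps each query parameter to a list of values; this is the
    # "dikt" that the new print statements in deep_get() would display.
    query_args = parse_qs(urlparse(url).query)
    print(query_args)  # {'persistentId': ['doi:10.7910/DVN/TJCLKP']}

    def deep_get(dikt, path):
        # Walk a nested dict/list with a dotted path such as "persistentId.0",
        # mirroring the docstring example deep_get(data, "data.files.0").
        value = dikt
        for component in path.split("."):
            value = value[int(component)] if component.isdigit() else value[component]
        return value

    # The value detect() stores in self.record_id and hands to fetch().
    print(deep_get(query_args, "persistentId.0"))  # doi:10.7910/DVN/TJCLKP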