Skip to content

Commit

Permalink
Look for only relevant GDC files
Browse files Browse the repository at this point in the history
  • Loading branch information
BinamB committed Dec 5, 2024
1 parent 456c46c commit 551d400
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ repos:
- id: no-commit-to-branch
args: [--branch, develop, --branch, master, --pattern, release/.*]
- repo: https://github.com/psf/black
rev: 20.8b1
rev: 22.3.0
hooks:
- id: black
62 changes: 62 additions & 0 deletions scripts/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import boto3
import csv
import time
import random
from google.cloud import storage
import threading
Expand Down Expand Up @@ -414,3 +415,64 @@ def get_indexd_records():
results[doc.did] = doc.urls

return results


def get_indexd_record_from_GDC_files(manifest_file, logger):
    """
    Look up the indexd record of every GUID listed in a GDC manifest.

    The manifest is read as a tab-separated file with a header row; the
    value of each row's ``id`` column is used as the GUID to fetch.

    Args:
        manifest_file (str): GDC manifest location
        logger: logger used to report retries and failed lookups

    Returns:
        dict: maps each GUID that was fetched successfully to the urls of
            its indexd record. GUIDs that could not be fetched are logged
            and left out of the result.

    Raises:
        KeyError: if a manifest row has no ``id`` column.
    """
    result = {}
    errored_list = []

    indexd_client = IndexClient(
        INDEXD["host"],
        INDEXD["version"],
        (INDEXD["auth"]["username"], INDEXD["auth"]["password"]),
    )

    def get_record_with_retry(guid, max_retries=5, base_delay=1, backoff_factor=2):
        """
        Get a record from indexd with retries and exponential backoff.

        Args:
            guid (str): The GUID to fetch.
            max_retries (int): Maximum number of attempts (must be >= 1).
            base_delay (int): Initial delay between retries in seconds.
            backoff_factor (int): Multiplicative factor for exponential backoff.

        Returns:
            Whatever ``indexd_client.get`` returns for the GUID.

        Raises:
            ValueError: If max_retries is smaller than 1.
            Exception: The last error from indexd if all attempts fail.
        """
        if max_retries < 1:
            # Without this guard the loop would never run and the helper
            # would silently hand back None.
            raise ValueError(f"max_retries must be at least 1, got {max_retries}")

        for attempt in range(1, max_retries + 1):
            try:
                return indexd_client.get(guid)
            except Exception as e:
                if attempt == max_retries:
                    raise
                wait_time = base_delay * (backoff_factor ** (attempt - 1))
                logger.warning(
                    f"Retrying {guid}: attempt {attempt}/{max_retries}, retrying in {wait_time}s. Error: {e}"
                )
                time.sleep(wait_time)

    # open GDC manifest file to extract guids
    # newline="" is what the csv module asks for when handed a file object.
    with open(manifest_file, "r", newline="") as csvfile:
        csv_reader = csv.DictReader(csvfile, delimiter="\t")
        for row in csv_reader:
            # Read the GUID once, outside the try block: a manifest without
            # an "id" column should fail with a plain KeyError instead of
            # raising a second time from inside the error handler.
            guid = row["id"]
            try:
                record = get_record_with_retry(guid)
                if record is None:
                    # NOTE(review): the client appears to return None for an
                    # unknown GUID -- confirm against the indexclient version
                    # in use. Reported explicitly rather than as a TypeError.
                    raise LookupError("no record returned by indexd")
                # get_indexd_records in this module reads ``doc.urls`` from
                # indexd documents, so prefer attribute access here as well;
                # fall back to mapping access if a plain dict is returned.
                if hasattr(record, "urls"):
                    result[guid] = record.urls
                else:
                    result[guid] = record["urls"]
            except Exception as e:
                logger.error(f"Could not find record {guid}. Errored with {e}")
                errored_list.append(guid)

    if errored_list:
        logger.warning(
            f"Found {len(errored_list)} guids that weren't found in indexd. Here are all the guids: {errored_list}"
        )
    return result
5 changes: 4 additions & 1 deletion scripts/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ def run(global_config):

logger.info("scan all copied objects")

indexd_records = utils.get_indexd_records()
indexd_records = {}
for manifest_file in manifest_files:
records = utils.get_indexd_record_from_GDC_files()
indexd_records.update(records)
aws_copied_objects, _ = build_object_dataset_aws(PROJECT_ACL, logger)
gs_copied_objects = utils.build_object_dataset_gs(PROJECT_ACL)

Expand Down

0 comments on commit 551d400

Please sign in to comment.