diff --git a/exporter.py b/exporter.py index ff998bf..f845ad0 100644 --- a/exporter.py +++ b/exporter.py @@ -14,6 +14,11 @@ GCP_BLOB_PREFIX = 'data/kgx-export/' def export_metadata(bucket): + """ + Generate a metadata file from previously created KGX export files + + :param bucket: the GCP storage bucket containing the KGX files + """ services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'edges.tsv.gz', 'edges.tsv.gz') services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz') services.generate_metadata('edges.tsv.gz', 'nodes.tsv.gz', 'KGE') @@ -21,6 +26,12 @@ def export_metadata(bucket): def get_valid_nodes(bucket) -> set[str]: + """ + Retrieve the set of nodes used by a KGX nodes file + + :param bucket: the GCP storage bucket containing the KGX file + :returns: a set of node curies + """ services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz') node_set = set([]) with gzip.open('nodes.tsv.gz', 'rb') as infile: @@ -53,7 +64,7 @@ def get_conn() -> pymysql.connections.Connection: logging.info('Starting Main') parser = argparse.ArgumentParser() parser.add_argument('-t', '--target', help='the export target: edges, nodes, or metadata', required=True) - parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True) + parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True) # TODO: Replace with -b --bucket parser.add_argument('-i', '--instance', help='GCP DB instance name') parser.add_argument('-d', '--database', help='database name') parser.add_argument('-u', '--user', help='database username') @@ -70,7 +81,7 @@ def get_conn() -> pymysql.connections.Connection: if args.verbose: logging.getLogger().setLevel(logging.DEBUG) - if args.target == 'metadata': + if args.target == 'metadata': # if we are just exporting metadata, a database connection is not necessary export_metadata(uniprot_bucket) else: session_maker = init_db( diff --git a/targeted.py 
b/targeted.py index 74f7b6f..f8dfa03 100644 --- a/targeted.py +++ b/targeted.py @@ -2,7 +2,6 @@ import logging import math -import sqlalchemy from sqlalchemy import text from sqlalchemy.orm import Session from sqlalchemy import Column, String, Integer @@ -90,6 +89,14 @@ def write_nodes(curies: list[str], normalize_dict: dict[str, dict], output_filen def get_assertion_ids(session, limit=600000, offset=0): + """ + Get the assertion ids to be exported in this run + + :param session: the database session + :param limit: limit for assertion query + :param offset: offset for assertion query + :returns: a list of assertion ids + """ id_query = text('SELECT assertion_id FROM targeted.assertion WHERE assertion_id NOT IN ' '(SELECT DISTINCT(assertion_id) ' 'FROM assertion_evidence_feedback af ' @@ -110,6 +117,14 @@ def get_assertion_ids(session, limit=600000, offset=0): def get_edge_data(session: Session, id_list, chunk_size=1000, edge_limit=5) -> list[str]: + """ + Generate edge data for the given list of ids + :param session: the database session + :param id_list: the list of assertion ids + :param chunk_size: the number of edge rows to yield at a time + :param edge_limit: the maximum number of evidence records to return for each edge + :returns: edge data for up to chunk_size assertion ids from id_list with up to edge_limit supporting evidence records + """ logging.info(f'\nStarting edge data gathering\nChunk Size: {chunk_size}\nEdge Limit: {edge_limit}\n') logging.info(f'Total Assertions: {len(id_list)}.') logging.info(f'Partition count: {math.ceil(len(id_list) / chunk_size)}')