
Commit

Adding some documentation
edgargaticaCU committed Mar 25, 2024
1 parent db3c286 commit 407e8df
Showing 2 changed files with 29 additions and 3 deletions.
15 changes: 13 additions & 2 deletions exporter.py
@@ -14,13 +14,24 @@
 GCP_BLOB_PREFIX = 'data/kgx-export/'

 def export_metadata(bucket):
+    """
+    Generate a metadata file from previously created KGX export files
+
+    :param bucket: the GCP storage bucket containing the KGX files
+    """
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'edges.tsv.gz', 'edges.tsv.gz')
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz')
     services.generate_metadata('edges.tsv.gz', 'nodes.tsv.gz', 'KGE')
     services.upload_to_gcp(bucket, 'KGE/content_metadata.json', GCP_BLOB_PREFIX + 'content_metadata.json')
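Note: the services helpers called above are not part of this diff. As a rough sketch only, assuming google-cloud-storage and taking the bucket by name (the _sketch function names are illustrative, not the repository's API), they might look like:

from google.cloud import storage

def get_from_gcp_sketch(bucket_name: str, blob_path: str, local_path: str) -> None:
    # Download a blob from the bucket to a local file.
    storage.Client().bucket(bucket_name).blob(blob_path).download_to_filename(local_path)

def upload_to_gcp_sketch(bucket_name: str, local_path: str, blob_path: str) -> None:
    # Upload a local file to the given blob path in the bucket.
    storage.Client().bucket(bucket_name).blob(blob_path).upload_from_filename(local_path)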


 def get_valid_nodes(bucket) -> set[str]:
+    """
+    Retrieve the set of nodes used by a KGX nodes file
+
+    :param bucket: the GCP storage bucket containing the KGX file
+    :returns: a set of node CURIEs
+    """
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz')
     node_set = set([])
     with gzip.open('nodes.tsv.gz', 'rb') as infile:
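The rest of get_valid_nodes is collapsed by the diff view. A self-contained sketch of the likely idea, reading the gzipped KGX nodes TSV and collecting the first column as CURIEs (the function name and header handling are assumptions):

import gzip

def collect_node_curies(filename: str = 'nodes.tsv.gz') -> set[str]:
    # Gather the CURIE (first column) of every row in a gzipped KGX nodes file.
    node_set: set[str] = set()
    with gzip.open(filename, 'rt', encoding='utf-8') as infile:
        for line in infile:
            curie = line.split('\t', 1)[0].strip()
            if curie and curie != 'id':  # skip a KGX header row, if present
                node_set.add(curie)
    return node_set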
@@ -53,7 +64,7 @@ def get_conn() -> pymysql.connections.Connection:
 logging.info('Starting Main')
 parser = argparse.ArgumentParser()
 parser.add_argument('-t', '--target', help='the export target: edges, nodes, or metadata', required=True)
-parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True)
+parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True)  # TODO: Replace with -b --bucket
 parser.add_argument('-i', '--instance', help='GCP DB instance name')
 parser.add_argument('-d', '--database', help='database name')
 parser.add_argument('-u', '--user', help='database username')
@@ -70,7 +81,7 @@ def get_conn() -> pymysql.connections.Connection:

 if args.verbose:
     logging.getLogger().setLevel(logging.DEBUG)
-if args.target == 'metadata':
+if args.target == 'metadata':  # if we are just exporting metadata, a database connection is not necessary
     export_metadata(uniprot_bucket)
 else:
     session_maker = init_db(
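init_db itself is truncated below. Since the script talks to a MySQL instance via pymysql, a plausible sketch of such a session factory (the name, signature, and connection URL are assumptions, not the repository's actual code):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def init_db_sketch(username: str, password: str, database: str, host: str = '127.0.0.1'):
    # Build an engine for the (possibly Cloud SQL proxied) MySQL database and
    # return a session factory analogous to session_maker above.
    engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}/{database}')
    return sessionmaker(bind=engine)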
17 changes: 16 additions & 1 deletion targeted.py
@@ -2,7 +2,6 @@
 import logging
 import math

-import sqlalchemy
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 from sqlalchemy import Column, String, Integer
@@ -90,6 +89,14 @@ def write_nodes(curies: list[str], normalize_dict: dict[str, dict], output_filename


 def get_assertion_ids(session, limit=600000, offset=0):
+    """
+    Get the assertion ids to be exported in this run
+
+    :param session: the database session
+    :param limit: limit for assertion query
+    :param offset: offset for assertion query
+    :returns: a list of assertion ids
+    """
     id_query = text('SELECT assertion_id FROM targeted.assertion WHERE assertion_id NOT IN '
                     '(SELECT DISTINCT(assertion_id) '
                     'FROM assertion_evidence_feedback af '
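The exclusion query is truncated here, but the limit and offset parameters point to plain offset pagination. A hedged sketch of how the tail of the function might apply them (the simplified query and helper name are illustrative):

from sqlalchemy import text

def fetch_assertion_ids_sketch(session, limit: int = 600000, offset: int = 0) -> list:
    # Simplified stand-in for the full exclusion query above; ORDER BY keeps
    # successive offset windows deterministic.
    query = text('SELECT assertion_id FROM targeted.assertion '
                 'ORDER BY assertion_id LIMIT :limit OFFSET :offset')
    rows = session.execute(query, {'limit': limit, 'offset': offset})
    return [row[0] for row in rows]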
@@ -110,6 +117,14 @@ def get_assertion_ids(session, limit=600000, offset=0):


 def get_edge_data(session: Session, id_list, chunk_size=1000, edge_limit=5) -> list[str]:
+    """
+    Generate edge data for the given list of ids
+    :param session: the database session
+    :param id_list: the list of assertion ids
+    :param chunk_size: the number of edge rows to yield at a time
+    :param edge_limit: the maximum number of evidence records to return for each edge
+    :returns: edge data for up to chunk_size assertion ids from id_list, with up to edge_limit supporting evidence records each
+    """
     logging.info(f'\nStarting edge data gathering\nChunk Size: {chunk_size}\nEdge Limit: {edge_limit}\n')
     logging.info(f'Total Assertions: {len(id_list)}.')
     logging.info(f'Partition count: {math.ceil(len(id_list) / chunk_size)}')
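The partition-count log line implies id_list is processed in slices of at most chunk_size ids. A minimal sketch of that chunking (helper name assumed):

def partition_ids(id_list: list, chunk_size: int = 1000):
    # Yield successive slices of at most chunk_size ids; the slice count equals
    # math.ceil(len(id_list) / chunk_size), matching the log line above.
    for start in range(0, len(id_list), chunk_size):
        yield id_list[start:start + chunk_size]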
