
Commit

Adding some documentation
edgargaticaCU committed Mar 25, 2024
1 parent db3c286 commit 407e8df
Showing 2 changed files with 29 additions and 3 deletions.
15 changes: 13 additions & 2 deletions exporter.py
@@ -14,13 +14,24 @@
 GCP_BLOB_PREFIX = 'data/kgx-export/'

 def export_metadata(bucket):
+    """
+    Generate a metadata file from previously created KGX export files
+
+    :param bucket: the GCP storage bucket containing the KGX files
+    """
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'edges.tsv.gz', 'edges.tsv.gz')
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz')
     services.generate_metadata('edges.tsv.gz', 'nodes.tsv.gz', 'KGE')
     services.upload_to_gcp(bucket, 'KGE/content_metadata.json', GCP_BLOB_PREFIX + 'content_metadata.json')
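Note: the services helpers called above are not part of this diff. As a rough sketch only, assuming google-cloud-storage and taking the bucket by name (the _sketch function names are illustrative, not the repository's API), they might look like:

from google.cloud import storage

def get_from_gcp_sketch(bucket_name: str, blob_path: str, local_path: str) -> None:
    # Download a blob from the bucket to a local file.
    storage.Client().bucket(bucket_name).blob(blob_path).download_to_filename(local_path)

def upload_to_gcp_sketch(bucket_name: str, local_path: str, blob_path: str) -> None:
    # Upload a local file to the given blob path in the bucket.
    storage.Client().bucket(bucket_name).blob(blob_path).upload_from_filename(local_path)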


 def get_valid_nodes(bucket) -> set[str]:
+    """
+    Retrieve the set of nodes used by a KGX nodes file
+
+    :param bucket: the GCP storage bucket containing the KGX file
+    :returns: a set of node CURIEs
+    """
     services.get_from_gcp(bucket, GCP_BLOB_PREFIX + 'nodes.tsv.gz', 'nodes.tsv.gz')
     node_set = set([])
     with gzip.open('nodes.tsv.gz', 'rb') as infile:
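The rest of get_valid_nodes is collapsed by the diff view. A self-contained sketch of the likely idea, reading the gzipped KGX nodes TSV and collecting the first column as CURIEs (the function name and header handling are assumptions):

import gzip

def collect_node_curies(filename: str = 'nodes.tsv.gz') -> set[str]:
    # Gather the CURIE (first column) of every row in a gzipped KGX nodes file.
    node_set: set[str] = set()
    with gzip.open(filename, 'rt', encoding='utf-8') as infile:
        for line in infile:
            curie = line.split('\t', 1)[0].strip()
            if curie and curie != 'id':  # skip a KGX header row, if present
                node_set.add(curie)
    return node_set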
@@ -53,7 +64,7 @@ def get_conn() -> pymysql.connections.Connection:
 logging.info('Starting Main')
 parser = argparse.ArgumentParser()
 parser.add_argument('-t', '--target', help='the export target: edges, nodes, or metadata', required=True)
-parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True)
+parser.add_argument('-uni', '--uniprot_bucket', help='storage bucket for UniProt data', required=True)  # TODO: Replace with -b --bucket
 parser.add_argument('-i', '--instance', help='GCP DB instance name')
 parser.add_argument('-d', '--database', help='database name')
 parser.add_argument('-u', '--user', help='database username')
@@ -70,7 +81,7 @@ def get_conn() -> pymysql.connections.Connection:

 if args.verbose:
     logging.getLogger().setLevel(logging.DEBUG)
-if args.target == 'metadata':
+if args.target == 'metadata':  # if we are just exporting metadata, a database connection is not necessary
     export_metadata(uniprot_bucket)
 else:
     session_maker = init_db(
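init_db itself is truncated below. Since the script talks to a MySQL instance via pymysql, a plausible sketch of such a session factory (the name, signature, and connection URL are assumptions, not the repository's actual code):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def init_db_sketch(username: str, password: str, database: str, host: str = '127.0.0.1'):
    # Build an engine for the (possibly Cloud SQL proxied) MySQL database and
    # return a session factory analogous to session_maker above.
    engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}/{database}')
    return sessionmaker(bind=engine)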
17 changes: 16 additions & 1 deletion targeted.py
@@ -2,7 +2,6 @@
 import logging
 import math

-import sqlalchemy
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 from sqlalchemy import Column, String, Integer
@@ -90,6 +89,14 @@ def write_nodes(curies: list[str], normalize_dict: dict[str, dict], output_filename


 def get_assertion_ids(session, limit=600000, offset=0):
+    """
+    Get the assertion ids to be exported in this run
+
+    :param session: the database session
+    :param limit: limit for assertion query
+    :param offset: offset for assertion query
+    :returns: a list of assertion ids
+    """
     id_query = text('SELECT assertion_id FROM targeted.assertion WHERE assertion_id NOT IN '
                     '(SELECT DISTINCT(assertion_id) '
                     'FROM assertion_evidence_feedback af '
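The exclusion query is truncated here, but the limit and offset parameters point to plain offset pagination. A hedged sketch of how the tail of the function might apply them (the simplified query and helper name are illustrative):

from sqlalchemy import text

def fetch_assertion_ids_sketch(session, limit: int = 600000, offset: int = 0) -> list:
    # Simplified stand-in for the full exclusion query above; ORDER BY keeps
    # successive offset windows deterministic.
    query = text('SELECT assertion_id FROM targeted.assertion '
                 'ORDER BY assertion_id LIMIT :limit OFFSET :offset')
    rows = session.execute(query, {'limit': limit, 'offset': offset})
    return [row[0] for row in rows]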
@@ -110,6 +117,14 @@ def get_assertion_ids(session, limit=600000, offset=0):


 def get_edge_data(session: Session, id_list, chunk_size=1000, edge_limit=5) -> list[str]:
+    """
+    Generate edge data for the given list of ids
+    :param session: the database session
+    :param id_list: the list of assertion ids
+    :param chunk_size: the number of edge rows to yield at a time
+    :param edge_limit: the maximum number of evidence records to return for each edge
+    :returns: edge data for up to chunk_size assertion ids from id_list, with up to edge_limit supporting evidence records each
+    """
     logging.info(f'\nStarting edge data gathering\nChunk Size: {chunk_size}\nEdge Limit: {edge_limit}\n')
     logging.info(f'Total Assertions: {len(id_list)}.')
     logging.info(f'Partition count: {math.ceil(len(id_list) / chunk_size)}')
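The partition-count log line implies id_list is processed in slices of at most chunk_size ids. A minimal sketch of that chunking (helper name assumed):

def partition_ids(id_list: list, chunk_size: int = 1000):
    # Yield successive slices of at most chunk_size ids; the slice count equals
    # math.ceil(len(id_list) / chunk_size), matching the log line above.
    for start in range(0, len(id_list), chunk_size):
        yield id_list[start:start + chunk_size]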
