added documentation for data translators

brain-bican · Oct 4, 2024 · e18846e · e18846e
1 parent b9cd9e4
commit e18846e
Show file tree

Hide file tree

Showing 4 changed files with 212 additions and 22 deletions.
diff --git a/bkbit/data_translators/genome_annotation_translator.py b/bkbit/data_translators/genome_annotation_translator.py
@@ -7,17 +7,18 @@
 4. Serialize the extracted information into JSON-LD format for further use.
 
 Classes:
-    Gff3: A class to handle the entire process of downloading, parsing, and processing GFF3 files.
+    Gff3: The Gff3 class is designed to handle the complete lifecycle of downloading, parsing, and processing GFF3 files from NCBI or Ensembl repositories. It extracts gene annotations and serializes the data into JSON-LD format.
 
 Functions:
-    cli: Command line interface function to execute the module as a script.
+    gff2jsonld: The gff2jsonld function is responsible for creating GeneAnnotation objects from a provided GFF3 file and serializing the extracted information into the JSON-LD format.
 
 Usage:
     The module can be run as a standalone script by executing it with appropriate arguments and options:
     
     ```
     python genome_annotation_translator.py <content_url> -a <assembly_accession> -s <assembly_strain> -l <log_level> -f
     ```
+    
     The script will download the GFF3 file from the specified URL, parse it, and serialize the extracted information into JSON-LD format.
 
 Example:
@@ -180,7 +181,7 @@ def __init__(
             self.taxon_scientific_name = load_json(TAXON_SCIENTIFIC_NAME_PATH)
             self.taxon_common_name = load_json(TAXON_COMMON_NAME_PATH)
         except FileNotFoundError as e:
-            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download_ncbi_taxonomy' command first." )
+            self.logger.critical("NCBI Taxonomy not downloaded. Run 'bkbit download-ncbi-taxonomy' command first." )
             print(e)
             sys.exit(2)
 

diff --git a/bkbit/data_translators/library_generation_translator.py b/bkbit/data_translators/library_generation_translator.py
@@ -21,7 +21,7 @@
     The module can be run as a standalone script using the command-line interface with the appropriate arguments and options:
 
     ```
-    python specimen_portal.py <nhash_id> -d
+    python library_generation_translator.py <nhash_id> [-d]
     ```
 
     This script will parse the nhash ID and serialize the generated data into JSON-LD format, with the option to parse descendants or ancestors.
@@ -95,18 +95,15 @@ class SpecimenPortal:
         serialize_to_jsonld(exclude_none=True, exclude_unset=False):
             Serializes the generated objects into JSON-LD format for further use or storage.
 
-    Static Methods:
-        __check_valueset_membership(enum_type, nimp_value):
-            Checks if a given value belongs to a specified enum.
-    
-    Private Methods:
-        __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
+        parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
             Parses a single nhash ID and optionally saves the result to a JSON-LD file.
 
-        __parse_multiple_nashids(jwt_token, file_path, descendants):
-        Parses multiple nhash IDs from a file and saves the results to JSON-LD files.
-        
+        parse_multiple_nashids(jwt_token, file_path, descendants):
+            Parses multiple nhash IDs from a file and saves the results to JSON-LD files.
 
+    Static Methods:
+        __check_valueset_membership(enum_type, nimp_value):
+            Checks if a given value belongs to a specified enum.
     """
     def __init__(self, jwt_token):
         self.jwt_token = jwt_token
@@ -360,7 +357,7 @@ def serialize_to_jsonld(
         return json.dumps(output_data, indent=2)
 
 
-def __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
+def parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
     """
     Parse a single nashid using the SpecimenPortal class.
 
@@ -388,7 +385,7 @@ def __parse_single_nashid(jwt_token, nhash_id, descendants, save_to_file=False):
         print(sp_obj.serialize_to_jsonld())
 
 
-def __parse_multiple_nashids(jwt_token, file_path, descendants):
+def parse_multiple_nashids(jwt_token, file_path, descendants):
     """
     Parse multiple nashids from a file.
 
@@ -405,7 +402,7 @@ def __parse_multiple_nashids(jwt_token, file_path, descendants):
         nhashids = [line.strip() for line in file.readlines()]
     with Pool() as pool:
         results = pool.starmap(
-            __parse_single_nashid,
+            parse_single_nashid,
             [(jwt_token, nhash_id, descendants, True) for nhash_id in nhashids],
         )
     return results
@@ -438,9 +435,9 @@ def specimen2jsonld(nhash_id: str, descendants: bool):
     if not jwt_token or jwt_token == "":
         raise ValueError("JWT token is required")
     if os.path.isfile(nhash_id):
-        __parse_multiple_nashids(jwt_token, nhash_id, descendants)
+        parse_multiple_nashids(jwt_token, nhash_id, descendants)
     else:
-        __parse_single_nashid(jwt_token, nhash_id, descendants)
+        parse_single_nashid(jwt_token, nhash_id, descendants)
 
 
 if __name__ == "__main__":

diff --git a/docs/data_translators.rst b/docs/data_translators.rst
@@ -0,0 +1,181 @@
+.. _datatranslators:
+
+Data Translators
+================
+
+Annotated Genome Data
+----------------------
+Generate JSON-LD files for annotated genes from a given GFF3 file. Currently GFF3 files from ENSEMBL and NCBI are supported.
+
+Each JSON-LD file will contain:
+
+- GeneAnnotation objects
+- 1 GenomeAnnotation object
+- 1 GenomeAssembly object
+- 1 OrganismTaxon object
+- 1 Checksum object
+
+Command Line 
+.............
+
+``bkbit gff2jsonld``
+,,,,,,,,,,,,,,,,,,,,,
+
+    .. code-block:: bash
+
+        $ bkbit gff2jsonld [OPTIONS] GFF3_URL
+
+Options
+,,,,,,,,
+
+    ``-a, --assembly_accession``
+        ID assigned to the genomic assembly used in the GFF3 file.
+        **Note: Must be provided when using ENSEMBL GFF3 files**
+
+    ``-s, --assembly_strain``
+        Specific strain of the organism associated with the GFF3 file.
+
+    ``-l, --log_level``
+        Logging level.
+
+        Default:
+            WARNING
+        Options:
+            DEBUG | INFO | WARNING | ERROR | CRITICAL
+
+    ``-f, --log_to_file``
+        Log to a file instead of the console.
+
+        Default:
+            FALSE
+
+Arguments
+,,,,,,,,,,
+
+    ``GFF3_URL``
+        URL to the GFF3 file.
+
+Examples 
+.........
+
+Example 1: NCBI GFF3 file
+,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit gff2jsonld 'https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9823/106/GCF_000003025.6_Sscrofa11.1/GCF_000003025.6_Sscrofa11.1_genomic.gff.gz' > output.jsonld
+
+
+Example 2: ENSEMBL GFF3 file
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    $ bkbit gff2jsonld -a 'GCF_003339765.1' 'https://ftp.ensembl.org/pub/release-104/gff3/macaca_mulatta/Macaca_mulatta.Mmul_10.104.gff3.gz' > output.jsonld
+
+
+Specimen Data
+----------------------
+Generate JSON-LD files for specimens, subjects, and their respective ancestors or descendants. Data is retrieved from the `BICAN Specimen Portal <https://brain-specimenportal.org/>`_.
+
+Command Line 
+.............
+
+``bkbit specimen2jsonld``
+,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+    .. code-block:: bash
+
+        $ bkbit specimen2jsonld [OPTIONS] NHASH_ID_OR_FILE
+
+Options
+,,,,,,,,
+
+    ``-d, --descendants``
+        A boolean flag that, when provided, generates BICAN objects for the given NHASH_ID and all of its descendants. 
+        If this flag is not set (DEFAULT), then the ancestors will be processed.
+
+Arguments
+,,,,,,,,,,
+
+    ``NHASH_ID_OR_FILE``
+        The NHASH_ID of the specimen or a file containing a list of NHASH_IDs. 
+        If a file is provided, the file should contain one NHASH_ID per line.
+
+Environment Variables 
+......................
+
+jwt_token
+,,,,,,,,,
+
+    You **must** set the SpecimenPortal Personal API Token as an environment variable before running ``bkbit specimen2jsonld``. Once set, the token will be used to authenticate with the Specimen Portal API and retrieve the specimen metadata.
+
+    .. code-block:: bash
+
+        $ export jwt_token=specimen_portal_personal_api_token
+
+
+Examples 
+.........
+
+Example 1: Parse a single record and its ancestors
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run specimen2jsonld command 
+    $ bkbit specimen2jsonld 'LP-CVFLMQ819998' > output.jsonld
+
+Example 2: Parse a single record and its descendants
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Run specimen2jsonld command. Important: include '--descendants' flag
+    $ bkbit specimen2jsonld -d 'DO-GICE7463' > output.jsonld
+
+Example 3: Parse a file containing record(s) and their respective ancestors
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Contents of input file 
+    $ cat input_nhash_ids.txt
+    LA-TZWCWB265559FVVNTS329147
+    LA-IAXCCV360563HBFKKM103455
+    LA-JFCEST535498UIPMOH349083
+
+    # Run specimen2jsonld command 
+    $ bkbit specimen2jsonld input_nhash_ids.txt 
+
+    # Expected output 
+    $ ls .
+    LA-TZWCWB265559FVVNTS329147.jsonld
+    LA-IAXCCV360563HBFKKM103455.jsonld
+    LA-JFCEST535498UIPMOH349083.jsonld
+
+Example 4: Parse a file containing record(s) and their respective descendants
+,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
+
+.. code-block:: bash
+
+    # Contents of input file 
+    $ cat input_nhash_ids.txt
+    DO-XIQQ6047
+    DO-WFFF3774
+    DO-RMRL6873
+
+    # Run specimen2jsonld command. Important: include '--descendants' flag
+    $ bkbit specimen2jsonld -d input_nhash_ids.txt 
+
+    # Expected output 
+    $ ls .
+    DO-XIQQ6047.jsonld
+    DO-WFFF3774.jsonld
+    DO-RMRL6873.jsonld
+
+Structured Anatomical Data
+----------------------------
+
+
+
diff --git a/docs/index.rst b/docs/index.rst
@@ -8,14 +8,25 @@ Brain Knowledge Base Interaction Toolkit Documentation
 This package contains tools to use the BICAN Knowledgebase Data Models.
 
 .. toctree::
-   :maxdepth: 2
-   :caption: Contents:
+   :maxdepth: 1
+   :caption: GETTING STARTED
 
    install
-   bkbit-quickstart
-   contributing
+
+.. toctree::
+   :maxdepth: 2
+   :caption: USAGE
+
+   data_translators
+
+.. toctree::
+   :maxdepth: 1
+   :caption: REFERENCE
+
    modules
 
+
+
 Indices and tables
 ==================