Docstrings and README

dmitrymyl · Aug 4, 2021 · d37001e · d37001e
1 parent e9ee5fa
commit d37001e
Show file tree

Hide file tree

Showing 3 changed files with 359 additions and 79 deletions.
diff --git a/README.md b/README.md
@@ -224,7 +224,7 @@ Getting best orthologs:
 3. ```stats.txt```: plain text file with statistics describing every pipeline step.
 4. ```best.ortholog_annotation.tsv``` (optional): plain table with a header and two columns that matches query lncRNAs names with subject species gene names that overlap lncRNAs orthologs.
 
-The principal structure of the output directory is shown below:
+A principal structure of the output directory is shown below:
 ```
 outdir/
 ├─ best.ortholog_annotation.tsv
@@ -479,7 +479,7 @@ Output:
   -output [str]         output filename.
 ```
 ## ```config``` files for CLI arguments
-Instead of entering each CLI argument for every subcommand, a ```.config``` file with CLI arguments can be supplied via ```ortho2align SUBCOMMAND @subcommand.config```. The first line of the file is the name of the subcommand, next lines contain argument names and their values one by one. Check ```config``` directory for sample ```.config``` files.
+Instead of entering each CLI argument for every subcommand, a ```.config``` file with CLI arguments can be supplied via ```ortho2align @subcommand.config```. The first line of the file is the name of the subcommand; the following lines contain argument names and their values one by one. Check the [configs](./configs) directory for sample ```.config``` files.
 # Testing
 You can test whether the installed package works. After the package is installed (and the environment activated, if needed), run the commands below:
 ```{bash}
@@ -488,4 +488,4 @@ tar -xzvf test.tar.gz
 cd test
 bash test.sh
 ```
-This will download an archive with test files (see [test](./test)). The main script ```test.sh``` will download genome and annotation files (2 Gb), which will take some time. You will need nearly 7 Gb of free space for unpacked files. After that, ```ortho2align``` will be run on the test data. If installation was correct, you won't encounter any problems and get a full output in the ```result``` directory.
+This will download an archive with test files (see [test](./test)). The test set consists of 96 strRNAs from [this study](https://www.sciencedirect.com/science/article/pii/S1097276518307019). The main script [test.sh](./test/test.sh) will download genome and annotation files (2 Gb), which will take some time. You will need nearly 7 Gb of free space for unpacked files. After that, ```ortho2align``` will be run on the test data. If installation was correct, you won't encounter any problems and will get a full output in the ```result``` directory.
diff --git a/ortho2align/parsing.py b/ortho2align/parsing.py
@@ -647,6 +647,27 @@ def parse_gtf_attributes(field):
 
 def gtf_parser(fileobj, verbose=False, sequence_file_path=None,
                name_regex=None, name_tag=None, parse_attributes=False):
+    """Parses GTF file.
+
+    Args:
+        fileobj (file): a file-like object or a stream.
+        verbose (bool): if True, will report a progress bar (default: False).
+        sequence_file_path (str, Path): a path to the corresponding genome file
+            (default: None).
+        name_regex (r-str): a regex to extract a name of each genomic feature
+            (default: None).
+        name_tag (str): a name tag in the attributes field (default: None).
+        parse_attributes (bool): if True, will parse attributes field at
+            the expense of performance (default: False).
+
+    Returns:
+        GenomicRangesList: a parsed gtf annotation.
+
+    Raises:
+        ValueError: in case name_regex doesn't match anything in the
+            attributes field.
+        IncorrectLine: in case one of the lines is an incorrect GTF line.
+    """
     granges = list()
     if parse_attributes:
         colnames = gtf_fields.keys()
@@ -688,14 +709,42 @@ def gtf_parser(fileobj, verbose=False, sequence_file_path=None,
 
 
 def parse_gff_start(field):
+    """Parses GFF3 start field.
+
+    Args:
+        field (str): GFF3 start field.
+
+    Returns:
+        int: the start field converted to an integer and decremented by 1.
+    """
     return int(field) - 1
 
 
 def parse_gff_end(field):
+    """Parses GFF3 end field.
+
+    Args:
+        field (str): GFF3 end field.
+
+    Returns:
+        int: parsed GFF3 end field.
+    """
     return int(field)
 
 
 def parse_gff_attributes(field):
+    """Parses GFF3 attributes field.
+
+    Args:
+        field (str): GFF3 attributes field.
+
+    Returns:
+        dict: of GFF3 attributes tags and corresponding values.
+
+    Raises:
+        IncorrectGFFAttrs: in case there are no GFF3 attributes
+            in the field or the field cannot be parsed correctly.
+    """
     try:
         data = {tag: value
                 for tag, value in (item.split('=')
@@ -732,6 +781,27 @@ def parse_gff_attributes(field):
 
 def gff_parser(fileobj, verbose=False, sequence_file_path=None,
                name_regex=None, name_tag=None, parse_attributes=False):
+    """Parses GFF3 annotation file.
+
+    Args:
+        fileobj (file): a file-like object or a stream.
+        verbose (bool): if True, will report a progress bar (default: False).
+        sequence_file_path (str, Path): a path to the corresponding genome file
+            (default: None).
+        name_regex (r-str): a regex to extract a name of each genomic feature
+            (default: None).
+        name_tag (str): a name tag in the attributes field (default: None).
+        parse_attributes (bool): if True, will parse attributes field at
+            the expense of performance (default: False).
+
+    Returns:
+        GenomicRangesList: a parsed GFF3 annotation.
+
+    Raises:
+        ValueError: in case name_regex doesn't match anything in the
+            attributes field.
+        IncorrectLine: in case one of the lines is an incorrect GFF3 line.
+    """
     granges = list()
     if parse_attributes:
         colnames = gff_fields.keys()
@@ -772,6 +842,14 @@ def gff_parser(fileobj, verbose=False, sequence_file_path=None,
 
 
 def check_bed3(line):
+    """Checks whether a line can be parsed as a BED3 line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 3:
@@ -784,6 +862,14 @@ def check_bed3(line):
 
 
 def check_bed6(line):
+    """Checks whether a line can be parsed as a BED6 line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 6:
@@ -796,6 +882,14 @@ def check_bed6(line):
 
 
 def check_bed12(line):
+    """Checks whether a line can be parsed as a BED12 line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 12:
@@ -808,6 +902,14 @@ def check_bed12(line):
 
 
 def check_gtf(line):
+    """Checks whether a line can be parsed as a GTF line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 9:
@@ -820,6 +922,14 @@ def check_gtf(line):
 
 
 def check_gff(line):
+    """Checks whether a line can be parsed as a GFF3 line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 9:
@@ -832,6 +942,32 @@ def check_gff(line):
 
 
 def annotation_sniffer(fileobj, comment='#', separator='\t'):
+    """Defines the type of the annotation file.
+
+    The function reads every line from the start until it finds
+    the first uncommented line. Then it defines a number of fields
+    and checks whether the line can be parsed as a BED3, BED6, GTF,
+    GFF3 or BED12 line. Finally it returns the file pointer to the
+    beginning of the file.
+
+    Args:
+        fileobj (file): a file-like object or a stream that
+            contains the annotation. Must support `seek` method.
+        comment (str): a character used to comment lines at
+            their beginnings (default: '#').
+        separator (str): a character used as a field separator
+            (default: '\t').
+
+    Returns:
+        str: a name of the annotation file type, one of
+            'bed3', 'bed6', 'gtf', 'gff', 'bed12'.
+
+    Raises:
+        EmptyAnnotation: in case there are no lines at all,
+            there is an empty line at the beginning of the file
+            or there is an empty line after the commented lines.
+        UnrecognizedFormat: in case file format can't be recognized.
+    """
     line = fileobj.readline()
 
     while line.startswith(comment):
@@ -873,6 +1009,25 @@ def annotation_sniffer(fileobj, comment='#', separator='\t'):
 
 def parse_annotation(fileobj, verbose=False, sequence_file_path=None,
                      name_regex=None, name_tag=None, parse_attributes=False):
+    """Parses an annotation file.
+
+    First, it checks an annotation type with `annotation_sniffer` and then
+    it parses the file according to that type.
+
+    Args:
+        fileobj (file): a file-like object or a stream.
+        verbose (bool): if True, will report a progress bar (default: False).
+        sequence_file_path (str, Path): a path to the corresponding genome file
+            (default: None).
+        name_regex (r-str): a regex to extract a name of each genomic feature
+            (default: None).
+        name_tag (str): a name tag in the attributes field (default: None).
+        parse_attributes (bool): if True, will parse attributes field at
+            the expense of performance (default: False).
+
+    Returns:
+        GenomicRangesList: a parsed annotation.
+    """
     annotation_type = annotation_sniffer(fileobj)
     if annotation_type == 'bed3':
         return bed3_parser(fileobj, verbose, sequence_file_path)