From d37001e5477f42a2883c3fb4e8fdc0c28f6354b3 Mon Sep 17 00:00:00 2001
From: dmitrymyl
Date: Wed, 4 Aug 2021 17:18:18 +0300
Subject: [PATCH] Docstrings and README

---
 README.md               |   6 +-
 ortho2align/parsing.py  | 155 ++++++++++++++++++++++
 ortho2align/pipeline.py | 277 +++++++++++++++++++++++++++++-----------
 3 files changed, 359 insertions(+), 79 deletions(-)

diff --git a/README.md b/README.md
index 0ab0083..ebe5086 100644
--- a/README.md
+++ b/README.md
@@ -224,7 +224,7 @@ Getting best orthologs:
 3. ```stats.txt```: plain text file with statistics describing every pipeline step.
 4. ```best.ortholog_annotation.tsv``` (optional): plain table with a header and two columns that matches query lncRNAs names with subject species gene names that overlap lncRNAs orthologs.
 
-The principal structure of the output directory is shown below:
+A typical structure of the output directory is shown below:
 ```
 outdir/
 ├─ best.ortholog_annotation.tsv
@@ -479,7 +479,7 @@ Output:
 -output [str] output filename.
 ```
 ## ```config``` files for CLI arguments
-Instead of entering each CLI argument for every subcommand, a ```.config``` file with CLI arguments can be supplied via ```ortho2align SUBCOMMAND @subcommand.config```. The first line of the file is the name of the subcommand, next lines contain argument names and their values one by one. Check ```config``` directory for sample ```.config``` files.
+Instead of entering each CLI argument for every subcommand, a ```.config``` file with CLI arguments can be supplied via ```ortho2align @subcommand.config```. The first line of the file is the name of the subcommand; the next lines contain argument names and their values one by one. Check the [configs](./configs) directory for sample ```.config``` files.
 # Testing
 You can test the installed package, whether it works or not. After the package is installed (and the environment activated, if needed), run the commands below:
 ```{bash}
@@ -488,4 +488,4 @@ tar -xzvf test.tar.gz
 cd test
 bash test.sh
 ```
-This will download an archive with test files (see [test](./test)). The main script ```test.sh``` will download genome and annotation files (2 Gb), which will take some time. You will need nearly 7 Gb of free space for unpacked files. After that, ```ortho2align``` will be run on the test data. If installation was correct, you won't encounter any problems and get a full output in the ```result``` directory.
\ No newline at end of file
+This will download an archive with test files (see [test](./test)). The test set of RNAs consists of 96 strRNAs from [here](https://www.sciencedirect.com/science/article/pii/S1097276518307019). The main script [test.sh](./test/test.sh) will download genome and annotation files (2 Gb), which will take some time. You will need nearly 7 Gb of free space for unpacked files. After that, ```ortho2align``` will be run on the test data. If the installation was correct, you won't encounter any problems and will get the full output in the ```result``` directory.
\ No newline at end of file
diff --git a/ortho2align/parsing.py b/ortho2align/parsing.py
index d9d3f16..36486d8 100644
--- a/ortho2align/parsing.py
+++ b/ortho2align/parsing.py
@@ -647,6 +647,27 @@ def parse_gtf_attributes(field):
 
 
 def gtf_parser(fileobj, verbose=False, sequence_file_path=None,
               name_regex=None, name_tag=None, parse_attributes=False):
+    """Parses a GTF file.
+
+    Args:
+        fileobj (file): a file-like object or a stream.
+        verbose (bool): if True, will report a progress bar (default: False).
+ sequence_file_path (str, Path): a path to the corresponding genome file + (default: None). + name_regex (r-str): a regex to extract a name of each genomic feature + (default: None). + name_tag (str): a name tag in the attributes field (default: None). + parse_attributes (bool): if True, will parse attributes field at + the expense of performance (default: False). + + Returns: + GenomicRangesList: a parsed gtf annotation. + + Raises: + ValueError: in case name_regex doesn't match anything in the + attributes field. + IncorrectLine: in case one of the lines is incorrect gtf line. + """ granges = list() if parse_attributes: colnames = gtf_fields.keys() @@ -688,14 +709,42 @@ def gtf_parser(fileobj, verbose=False, sequence_file_path=None, def parse_gff_start(field): + """Parses GFF3 start field. + + Args: + field (str): GFF3 start field. + + Returns: + int: parsed GFF3 start field. + """ return int(field) - 1 def parse_gff_end(field): + """Parses GFF3 end field. + + Args: + field (str): GFF3 end field. + + Returns: + int: parsed GFF3 end field. + """ return int(field) def parse_gff_attributes(field): + """Parses GFF3 attributes field. + + Args: + field (str): GFF3 attributes field. + + Returns: + dict: of GFF3 attributes tags and corresponding values. + + Raises: + IncorrectGFFAttrs: in case there are no GFF3 attributes + in the field or the field cannot be parsed correctly. + """ try: data = {tag: value for tag, value in (item.split('=') @@ -732,6 +781,27 @@ def parse_gff_attributes(field): def gff_parser(fileobj, verbose=False, sequence_file_path=None, name_regex=None, name_tag=None, parse_attributes=False): + """Parses GFF3 annotation file. + + Args: + fileobj (file): a file-like object or a stream. + verbose (bool): if True, will report a progress bar (default: False). + sequence_file_path (str, Path): a path to the corresponding genome file + (default: None). + name_regex (r-str): a regex to extract a name of each genomic feature + (default: None). + name_tag (str): a name tag in the attributes field (default: None). + parse_attributes (bool): if True, will parse attributes field at + the expense of performance (default: False). + + Returns: + GenomicRangesList: a parsed GFF3 annotation. + + Raises: + ValueError: in case name_regex doesn't match anything in the + attributes field. + IncorrectLine: in case one of the lines is incorrect GFF3 line. + """ granges = list() if parse_attributes: colnames = gff_fields.keys() @@ -772,6 +842,14 @@ def gff_parser(fileobj, verbose=False, sequence_file_path=None, def check_bed3(line): + """Checks whether a line can be parsed as a BED3 line. + + Args: + line (str): a line to be checked. + + Returns: + bool: a result of the check. + """ try: record = line.strip().split('\t') if len(record) != 3: @@ -784,6 +862,14 @@ def check_bed3(line): def check_bed6(line): + """Checks whether a line can be parsed as a BED6 line. + + Args: + line (str): a line to be checked. + + Returns: + bool: a result of the check. + """ try: record = line.strip().split('\t') if len(record) != 6: @@ -796,6 +882,14 @@ def check_bed6(line): def check_bed12(line): + """Checks whether a line can be parsed as a BED12 line. + + Args: + line (str): a line to be checked. + + Returns: + bool: a result of the check. + """ try: record = line.strip().split('\t') if len(record) != 12: @@ -808,6 +902,14 @@ def check_bed12(line): def check_gtf(line): + """Checks whether a line can be parsed as a GTF line. + + Args: + line (str): a line to be checked. + + Returns: + bool: a result of the check. 
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 9:
@@ -820,6 +922,14 @@ def check_gtf(line):
 
 
 def check_gff(line):
+    """Checks whether a line can be parsed as a GFF3 line.
+
+    Args:
+        line (str): a line to be checked.
+
+    Returns:
+        bool: a result of the check.
+    """
     try:
         record = line.strip().split('\t')
         if len(record) != 9:
@@ -832,6 +942,32 @@ def check_gff(line):
 
 
 def annotation_sniffer(fileobj, comment='#', separator='\t'):
+    """Determines the type of the annotation file.
+
+    The function reads every line from the start until it finds
+    the first uncommented line. Then it determines the number of fields
+    and checks whether the line can be parsed as a BED3, BED6, GTF,
+    GFF3 or BED12 line. Finally, it returns the file pointer to the
+    beginning of the file.
+
+    Args:
+        fileobj (file): a file-like object or a stream that
+            contains the annotation. Must support the `seek` method.
+        comment (str): a character that marks commented lines at
+            their beginning (default: '#').
+        separator (str): a character used as a field separator
+            (default: '\t').
+
+    Returns:
+        str: a name of the annotation file type, one of
+            'bed3', 'bed6', 'gtf', 'gff', 'bed12'.
+
+    Raises:
+        EmptyAnnotation: in case there are no lines at all,
+            there is an empty line at the beginning of the file,
+            or there is an empty line after the commented lines.
+        UnrecognizedFormat: in case the file format can't be recognized.
+    """
 
     line = fileobj.readline()
     while line.startswith(comment):
@@ -873,6 +1009,25 @@ def annotation_sniffer(fileobj, comment='#', separator='\t'):
 
 
 def parse_annotation(fileobj, verbose=False, sequence_file_path=None,
                      name_regex=None, name_tag=None, parse_attributes=False):
+    """Parses an annotation file.
+
+    First, it detects the annotation type with `annotation_sniffer` and then
+    parses the file according to that type.
+
+    Args:
+        fileobj (file): a file-like object or a stream.
+        verbose (bool): if True, will report a progress bar (default: False).
+        sequence_file_path (str, Path): a path to the corresponding genome file
+            (default: None).
+        name_regex (r-str): a regex to extract a name of each genomic feature
+            (default: None).
+        name_tag (str): a name tag in the attributes field (default: None).
+        parse_attributes (bool): if True, will parse attributes field at
+            the expense of performance (default: False).
+
+    Returns:
+        GenomicRangesList: a parsed annotation.
+    """
     annotation_type = annotation_sniffer(fileobj)
     if annotation_type == 'bed3':
         return bed3_parser(fileobj, verbose, sequence_file_path)
diff --git a/ortho2align/pipeline.py b/ortho2align/pipeline.py
index 02c31c2..bc3a062 100644
--- a/ortho2align/pipeline.py
+++ b/ortho2align/pipeline.py
@@ -22,8 +22,14 @@
 
 class CmdProgressBar(tqdm):
-    # A solution from https://github.com/tqdm/tqdm/issues/509#issuecomment-410537096 for correct nesting.
+    """
+    Custom progress bar based on tqdm.
+    """
     def __new__(cls, *args, **kwargs):
+        """Creates new instances of CmdProgressBar.
+
+        A solution from https://github.com/tqdm/tqdm/issues/509#issuecomment-410537096 for correct nesting.
+        """
         try:
             cls._instances = tqdm._instances
         except AttributeError:
@@ -35,6 +41,14 @@ def __new__(cls, *args, **kwargs):
         return instance
 
     def __init__(self, cmd_hints, desc, disable=False):
+        """Initializes a CmdProgressBar instance.
+
+        Args:
+            cmd_hints (list of strs): descriptions of steps.
+            desc (str): description of the procedure.
+            disable (bool): if True, disables printing the progress bar
+                (default: False).
+        """
         super().__init__(cmd_hints,
                          bar_format="{desc}: {n_fmt}/{total_fmt}, {elapsed}{postfix}]",
                          desc=desc,
@@ -43,11 +57,33 @@ def __init__(self, cmd_hints, desc, disable=False):
                          disable=disable)
 
     def update(self):
+        """Updates the progress bar with the next command hint."""
         self.postfix = self.iterable[self.n]
         super().update()
 
 
-def bg_from_inter_ranges(genes_filename, name_regex, sample_size, output_filename, seed, silent=False):
+def bg_from_inter_ranges(genes_filename,
+                         name_regex,
+                         sample_size,
+                         output_filename,
+                         seed=0,
+                         silent=False):
+    """Creates background ranges from intergenic regions.
+
+    First, it merges all intersecting genomic ranges, then takes
+    the intergenic regions and finally samples them. The resulting
+    background ranges are saved to a BED6 file.
+
+    Args:
+        genes_filename (str, Path): a path to the annotation file.
+        name_regex (r-str): a regex to extract a name from the attributes
+            field of the GTF or GFF3 file.
+        sample_size (int): the number of background ranges to generate.
+            Must not exceed the number of intergenic regions.
+        output_filename (str, Path): an output filename, must be BED6.
+        seed (int): a random seed (default: 0).
+        silent (bool): if True, will suppress a progress bar (default: False).
+    """
     random.seed(seed)
 
     cmd_hints = ['parsing the annotation...',
@@ -79,6 +115,22 @@ def bg_from_inter_ranges(genes_filename, name_regex, sample_size, output_filenam
 
 
 def bg_from_shuffled_ranges(genes_filename, genome_filename, name_regex, sample_size, output_filename, seed):
+    """Generates background regions from a sample of shuffled genes.
+
+    First, it shuffles gene coordinates while preserving gene lengths, strands
+    and chromosomes. Next, it samples them without replacement. Finally, it
+    saves the sampled regions to a BED6 file.
+
+    Args:
+        genes_filename (str, Path): an annotation filename.
+        genome_filename (str, Path): a genome filename (needed to get chromosome sizes).
+        name_regex (r-str): a regex to extract a name from the attributes
+            field of the GTF or GFF3 file.
+        sample_size (int): the number of background ranges to generate.
+            Must not exceed the number of genes in the annotation.
+        output_filename (str, Path): an output filename, must be BED6.
+        seed (int): a random seed.
+    """
     with open(genes_filename, 'r') as infile:
         genes = parse_annotation(infile,
                                  name_regex=name_regex,
@@ -95,6 +147,23 @@ def bg_from_shuffled_ranges(genes_filename, genome_filename, name_regex, sample_
 
 
 def _align_two_ranges_blast(query, subject, **kwargs):
+    """A helper function to raise `GenomicRange.align_blast` to the module level.
+
+    Aligns query and subject genomic ranges.
+
+    Args:
+        query (GenomicRange): a query genomic range.
+        subject (GenomicRange): a subject genomic range.
+        kwargs (dict): kwargs sent to `GenomicRange.align_blast`.
+
+    Returns:
+        GenomicRangesAlignment: a result of the alignment.
+
+    Raises:
+        ExceptionLogger: in case of any exception being raised.
+            Saves a dict of query and subject genomic ranges as
+            important variables.
+    """
     try:
         return query.align_blast(subject, **kwargs)
     except Exception as e:
@@ -103,6 +172,24 @@ def _align_two_ranges_blast(query, subject, **kwargs):
 
 
 def _align_range_seqfile(query, seqfile, **kwargs):
+    """A helper function to raise `GenomicRange.align_blast_seqfile` to the module level.
+
+    Aligns the query genomic range with sequences in the file.
+
+    Args:
+        query (GenomicRange): a query genomic range.
+        seqfile (str, Path): a name of the file with sequences.
+        kwargs (dict): kwargs sent to `GenomicRange.align_blast_seqfile`.
+
+    Returns:
+        Alignment: an aligned range pair, which associates
+            the query genomic range and its alignment.
+
+    Raises:
+        ExceptionLogger: in case of any exception being raised.
+            Saves a dict of the query genomic range and the sequence file name as
+            important variables.
+    """
     try:
         return query.align_blast_seqfile(seqfile, **kwargs)
    except Exception as e:
@@ -111,6 +198,26 @@ def _align_range_seqfile(query, seqfile, **kwargs):
 
 
 def _estimate_bg_for_single_query_blast(query, bg_ranges, word_size, output_name, observations=1000, seed=0):
+    """A helper function to estimate background for a single query range.
+
+    Given a list of background ranges, it estimates the distribution of background scores,
+    which is saved to a JSON file.
+
+    Args:
+        query (GenomicRange): a query genomic range.
+        bg_ranges (GenomicRangesList): a collection of background genomic ranges.
+        word_size (int): a word_size parameter to use in BLAST.
+        output_name (str, Path): an output filename (must be JSON).
+        observations (int): maximum number of scores to keep (default: 1000).
+        seed (int): random seed (default: 0).
+
+    Returns:
+        tuple: of two items, the output filename and the number of found scores.
+
+    Raises:
+        ExceptionLogger: in case of any exception, saves the query genomic range
+            as an important variable.
+    """
     scores = list()
     try:
         for bg_range in bg_ranges:
@@ -129,6 +236,26 @@ def _estimate_bg_for_single_query_blast(query, bg_ranges, word_size, output_name
 
 
 def _estimate_bg_for_single_query_seqfile(query, seqfile, word_size, output_name, observations=1000, seed=0):
+    """A helper function to estimate background for a single query range.
+
+    Given a file with background sequences, it estimates the distribution of background scores,
+    which is saved to a JSON file.
+
+    Args:
+        query (GenomicRange): a query genomic range.
+        seqfile (str, Path): a name of the file with sequences.
+        word_size (int): a word_size parameter to use in BLAST.
+        output_name (str, Path): an output filename (must be JSON).
+        observations (int): maximum number of scores to keep (default: 1000).
+        seed (int): random seed (default: 0).
+
+    Returns:
+        int: the number of found scores.
+
+    Raises:
+        ExceptionLogger: in case of any exception, saves the query genomic range
+            as an important variable.
+    """
     try:
         alignment = _align_range_seqfile(query,
                                          seqfile,
@@ -159,6 +286,28 @@ def _estimate_bg_for_single_query_seqfile(query, seqfile, word_size, output_name
                         bg_name_regex=None,
                         silent=False,
                         stats_filename=None):
+    """Estimates the background score distribution for each query gene.
+
+    Args:
+        query_genes_filename (str, Path): query genes annotation filename.
+        bg_ranges_filename (str, Path): background ranges annotation filename.
+        query_genome_filename (str, Path): query genome filename.
+        subject_genome_filename (str, Path): subject genome filename.
+        outdir (str, Path): output directory name.
+        cores (int): how many cores to use (default: 1).
+        word_size (int): a word_size parameter for BLAST (default: 6).
+        observations (int): maximum number of background scores to keep
+            for each query gene (default: 1000).
+        seed (int): a random seed (default: 0).
+        query_name_regex (r-str): a regex to extract gene names from the GTF/GFF3
+            attributes field in the query genes annotation (default: None).
+        bg_name_regex (r-str): a regex to extract gene names from the GTF/GFF3
+            attributes field in the background ranges annotation (default: None).
+        silent (bool): if True, will suppress showing a progress bar (default: False).
+        stats_filename (str, Path): a filename to save stats about
+            background score distributions.
If None, will print to the progress bar + (default: None). + """ cmd_hints = ['reading annotations...', 'getting sample sequences...', 'aligning samples...', @@ -228,7 +377,7 @@ def estimate_background(query_genes_filename, f"Distribution of amount of found background HSPs:\n{slplot(results)}\n" \ f"Reported at most {observations} HSPs for each transcript.\n" \ "-----------------------" - if isinstance(stats_filename, str): + if isinstance(stats_filename, str) or isinstance(stats_filename, Path): with open(stats_filename, 'w') as stats_file: pbar.write(stats_msg, file=stats_file) else: @@ -238,6 +387,7 @@ def estimate_background(query_genes_filename, def _anchor(query_genes, query_anchors, query_genome_filename, subject_anchors, subject_genome_filename, query_chromsizes, subject_chromsizes, ortho_map, neighbour_dist, merge_dist, flank_dist): + """A helper function to make syntenies from anchoring genes.""" query_anchors.relation_mapping(subject_anchors, ortho_map, 'orthologs') @@ -280,7 +430,13 @@ def _anchor(query_genes, query_anchors, query_genome_filename, def _lift(query_genes, query_genome_filename, subject_genome_filename, query_chromsizes, subject_chromsizes, liftover_chains, merge_dist, flank_dist, min_ratio=0.05): + """A helper function to make syntenies with liftOver. + Returns: + GenomicRangesList: query genes with corresponding syntenies + in 'syntenies' and 'neighbours' relations. + GenomicRangesList: a set of query genes with no syntenies. + """ query_unalignable = list() query_prepared = list() @@ -320,6 +476,23 @@ def _lift(query_genes, query_genome_filename, subject_genome_filename, query_chr def _align_syntenies(grange, word_size, outdir): + """A helper function to align genomic range's neighbourhood to the syntenic regions. + + Alignments will be saved to the outdir/`grange.name`.json. + + Args: + grange (GenomicRange): a genomic range with GenomicRangesList instances + in 'neighbours' and 'syntenies' relations. + word_size (int): a word_size to use with BLAST. + outdir (str): an output directory name. + + Returns: + int: amount of saved alignments. + + Raises: + ExceptionLogger: in case of any exception with grange as an + important variable. + """ try: neighbourhood = grange.relations['neighbours'][0] alignments = list() @@ -359,6 +532,7 @@ def get_alignments(query_genes, word_size=6, silent=False, stats_filename=None): + """A get_alignments step of the pipeline.""" program_mode = "lift" query_genes_filename = query_genes @@ -493,6 +667,26 @@ def get_alignments(query_genes, def _build(query_gene_alignments_filename, query_bg_filename, fitter, fdr, pval_threshold): + """A helper function to build orthologs from significant HSPs. + + Args: + query_gene_alignments_filename (str, Path): a query gene alignments filename. + query_bg_filename (str, Path): a query-specific background score distribution filename. + fitter (str): a fitter to use for fitting the background score distribution. One of + 'kde', 'hist'. + fdr (bool): if True, will apply FDR-BH to p-values. + pval_threshold (float): a p-value threshold. + + Returns: + list: of strs which represent bed12 records of query orthologs. + list: of strs which represent bed12 records of subject orthologs. + list: of a single query GenomicRange instance in case there are no + significant HSPs. + + Raises: + ExceptionLogger: in case of any exception with a query range + as an important variable. 
+ """ with open(query_gene_alignments_filename, 'r') as infile: query_gene_alignments = [GenomicRangesAlignment.from_dict(dict_alignment) for dict_alignment in json.load(infile)] @@ -549,7 +743,7 @@ def build_orthologs(alignments, timeout=None, silent=False, stats_filename=None): - + """A build_orthologs step of the pipeline.""" alignments_dir = Path(alignments) background_dir = Path(background) fitting_type = fitting @@ -666,7 +860,7 @@ def get_best_orthologs(query_orthologs, outfile_query, outfile_subject, outfile_map): - + """A get_best_orthologs step of the pipeline.""" query_filename = Path(query_orthologs) subject_filename = Path(subject_orthologs) value_name = value @@ -727,7 +921,7 @@ def annotate_orthologs(subject_orthologs, output, subject_name_regex=None, stats_filename=None): - + """A annotate_orthologs step of the pipeline.""" subject_orthologs_filename = subject_orthologs subject_annotation_filename = subject_annotation output_filename = output @@ -777,75 +971,6 @@ def annotate_orthologs(subject_orthologs, print(stats_msg) -# def benchmark_orthologs(query_genes, -# query_name_regex, -# found_query, -# found_query_name_regex, -# found_subject, -# found_subject_name_regex, -# found_query_map, -# found_subject_map, -# found_query_subject_map, -# real_subject, -# real_subject_name_regex, -# real_map, -# outfile, -# tp_mode): - -# query_genes_filename = query_genes -# found_query_filename = found_query -# found_subject_filename = found_subject -# found_query_map_filename = found_query_map -# found_subject_map_filename = found_subject_map -# found_query_subject_map_filename = found_query_subject_map -# real_subject_filename = real_subject -# real_map_filename = real_map -# out_filename = outfile - -# with open(query_genes_filename, 'r') as infile: -# query_genes = parse_annotation(infile, -# name_regex=query_name_regex) - -# with open(found_query_filename, 'r') as infile: -# found_query_genes = parse_annotation(infile, -# name_regex=found_query_name_regex) - -# with open(found_subject_filename, 'r') as infile: -# found_subject_genes = parse_annotation(infile, -# name_regex=found_subject_name_regex) - -# with open(real_subject_filename, 'r') as infile: -# real_subject_genes = parse_annotation(infile, -# name_regex=real_subject_name_regex) - -# with open(found_query_map_filename, 'r') as infile: -# found_query_map = json.load(infile) - -# with open(found_subject_map_filename, 'r') as infile: -# found_subject_map = json.load(infile) - -# with open(found_query_subject_map_filename, 'r') as infile: -# found_query_subject_map = json.load(infile) - -# with open(real_map_filename, 'r') as infile: -# real_map = json.load(infile) - -# query_genes.relation_mapping(found_query_genes, found_query_map, 'found_query') -# query_genes.relation_mapping(found_subject_genes, found_subject_map, 'found_subject') -# query_genes.relation_mapping(real_subject_genes, real_map, 'real') - -# for query_gene in query_genes: -# query_gene.relations['found_query'].relation_mapping(query_gene.relations['found_subject'], -# found_query_subject_map, -# 'ortho_link') - -# trace_orthologs(query_genes) -# metrics = calc_ortholog_metrics(query_genes, tp_mode=tp_mode) - -# with open(out_filename, 'w') as outfile: -# json.dump(metrics, outfile) - - def run_pipeline(query_genes, query_genome, subject_annotation, @@ -876,7 +1001,7 @@ def run_pipeline(query_genes, seed=0, silent=False, annotate=False): - + """A method to run the whole pipeline.""" start = time.time() if not os.path.exists(outdir): os.mkdir(outdir)
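For reference, here is a minimal usage sketch of the `parse_annotation` API documented in this patch. The input filename and the `name_regex` value are hypothetical placeholders; the call signature follows the docstring added above.

```python
from ortho2align.parsing import parse_annotation

# Hypothetical annotation file; any BED3/BED6/BED12/GTF/GFF3 file should work,
# since parse_annotation sniffs the format before dispatching to a parser.
with open('genes.gtf') as infile:
    granges = parse_annotation(infile,
                               verbose=True,
                               name_regex=r'gene_name "([^"]+)"')  # regex is an assumption
# granges is a GenomicRangesList holding the parsed annotation.
```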