Skip to content

Commit

Permalink
v0.3.3
Browse files Browse the repository at this point in the history
  • Loading branch information
asistradition committed Mar 10, 2022
1 parent 8d2865b commit 5e06482
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 16 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### Version 0.3.3

* Correctly produced a TF binding table when `--save_location_data` is set.

### Version 0.3.2

* Corrected a parsing error when reading CisBP PWM files
Expand Down
42 changes: 33 additions & 9 deletions inferelator_prior/network_from_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,9 @@ def main():
fuzzy_motif_names=args.fuzzy,
motif_info=_minfo,
shuffle=args.shuffle,
lowmem=args.lowmem,
intergenic_only=_intergenic)
lowmem=not args.highmem,
intergenic_only=_intergenic,
save_locs=args.save_locs)

print("Prior matrix with {n} edges constructed".format(n=prior_matrix.sum().sum()))

Expand Down Expand Up @@ -98,8 +99,10 @@ def add_common_arguments(argp):
const=True, default=False)
argp.add_argument("--shuffle", dest="shuffle", help="Shuffle motif PWMs using SEED", metavar="SEED",
const=42, default=None, action='store', nargs='?', type=int)
argp.add_argument("--lowmem", dest="lowmem", help="Run in low memory mode", action='store_const',
const=True, default=False)
argp.add_argument("--lowmem", dest="lowmem", help="DEPRECATED: Run in low memory mode (replaced by --highmem)",
action='store_const', const=True, default=False)
argp.add_argument("--highmem", dest="highmem", help="Run in high memory mode (no performance benefits)",
action='store_const', const=True, default=False)


def parse_common_arguments(args):
Expand Down Expand Up @@ -155,7 +158,7 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
truncate_prob=0.35, scanner_thresh="1e-4", motif_format="meme",
gene_constraint_list=None, regulator_constraint_list=None,
output_prefix=None, debug=False, fuzzy_motif_names=False, motif_info=None,
shuffle=None, lowmem=False, intergenic_only=True):
shuffle=None, lowmem=False, intergenic_only=True, save_locs=False):
"""
Build a motif-based prior from windows around annotated genes.
Expand Down Expand Up @@ -205,11 +208,16 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
:type lowmem: bool
:param intergenic_only: Only scan intergenic regions for regulatory motifs
:type intergenic_only: bool
:param save_locs: Save motif mapping positions to a file, Defaults to False
:type save_locs: bool
:return prior_matrix, raw_matrix, prior_data: Filtered connectivity matrix, unfiltered score matrix, and unfiltered
long dataframe with scored TF->Gene pairs and genomic locations
:rtype: pd.DataFrame, pd.DataFrame, pd.DataFrame
"""

if save_locs:
save_locs = output_prefix + "_tf_binding_locs.tsv"

# PROCESS GENE ANNOTATIONS #########################################################################################

print("Loading genes from file ({f})".format(f=annotation_file))
Expand Down Expand Up @@ -249,7 +257,8 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
raw_matrix, prior_data = network_scan(motifs, motif_information, genes, genomic_fasta_file,
constraint_bed_file=constraint_bed_file, promoter_bed_file=gene_locs,
scanner_type=scanner_type, scanner_thresh=scanner_thresh,
num_cores=num_cores, motif_ic=motif_ic, tandem=tandem, debug=debug)
num_cores=num_cores, motif_ic=motif_ic, tandem=tandem, debug=debug,
save_locs=save_locs)

# PROCESS SCORES INTO NETWORK ##################################################################################
print("{n} regulatory edges identified by motif search".format(n=(raw_matrix != 0).sum().sum()))
Expand Down Expand Up @@ -281,19 +290,30 @@ def network_scan_build_single_tf(tf_mi_df):
if motif_peaks is not None:
ra_ma, pr_da = summarize_target_per_regulator(genes, motif_peaks, tf_mi_df, num_workers=1, debug=debug,
silent=True)

else:
ra_ma = pd.DataFrame(0.0, index=genes[GTF_GENENAME],
columns=tf_mi_df[MOTIF_NAME_COL].unique().tolist())
pr_da = None

return network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True)
if save_locs:
return network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True), motif_peaks
else:
return network_build(ra_ma, pr_da, num_cores=1, output_prefix=None, debug=debug, silent=True)

# MULTIPROCESS PER-TF ##########################################################################################
prior_matrix, raw_matrix, prior_data = [], [], []

with pathos.multiprocessing.Pool(processes=num_cores, maxtasksperchild=50) as pool:
motif_information = [df for _, df in motif_information.groupby(MOTIF_NAME_COL)]
for i, (p_m, r_m, p_d) in enumerate(pool.imap_unordered(network_scan_build_single_tf, motif_information)):
for i, res in enumerate(pool.imap_unordered(network_scan_build_single_tf, motif_information)):

if save_locs:
res[1].to_csv(save_locs, sep="\t", mode="w" if i == 0 else "a", header=i == 0)
res = res[0]

p_m, r_m, p_d = res

print("Processed TF {i}/{n}".format(i=i, n=len(motif_information)))
prior_matrix.append(p_m)
raw_matrix.append(r_m)
Expand Down Expand Up @@ -361,7 +381,7 @@ def _renamer(s):

def network_scan(motifs, motif_information, genes, genomic_fasta_file, constraint_bed_file=None,
promoter_bed_file=None, scanner_type='fimo', num_cores=1, motif_ic=6, tandem=100,
scanner_thresh="1e-4", debug=False, silent=False):
scanner_thresh="1e-4", debug=False, silent=False, save_locs=False):
# Load and scan target chromatin peaks
MotifScan.set_type(scanner_type)

Expand All @@ -379,6 +399,10 @@ def network_scan(motifs, motif_information, genes, genomic_fasta_file, constrain
for chromosome, df in motif_peaks[MotifScan.chromosome_col].value_counts().iteritems():
print("Chromosome {c}: {n} motif hits".format(c=chromosome, n=df))

if save_locs:
print(f"Writing output file {save_locs}")
motif_peaks.to_csv(save_locs, sep="\t")

# PROCESS CHROMATIN PEAKS INTO NETWORK MATRIX ######################################################################

# Process into an information score matrix
Expand Down
13 changes: 9 additions & 4 deletions inferelator_prior/network_from_motifs_fasta.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from inferelator_prior.network_from_motifs import (add_common_arguments, parse_common_arguments,
load_and_process_motifs, network_build)
from inferelator_prior.processor.gtf import GTF_CHROMOSOME, GTF_GENENAME, get_fasta_lengths, select_genes
from inferelator_prior.processor.prior import summarize_target_per_regulator, MotifScorer, PRIOR_TF, PRIOR_GENE
from inferelator_prior.processor.gtf import GTF_GENENAME, get_fasta_lengths, select_genes
from inferelator_prior.processor.prior import summarize_target_per_regulator, MotifScorer
from inferelator_prior.motifs.motif_scan import MotifScan

import argparse
Expand Down Expand Up @@ -36,14 +36,15 @@ def main():
debug=args.debug,
fuzzy_motif_names=args.fuzzy,
motif_info=_minfo,
shuffle=args.shuffle)
shuffle=args.shuffle,
save_locs=args.save_locs)


def build_motif_prior_from_fasta(motif_file, promoter_fasta_file, scanner_type='fimo', num_cores=1, motif_ic=6,
tandem=100, truncate_prob=0.35, scanner_thresh="1e-4", motif_format="meme",
gene_constraint_list=None, regulator_constraint_list=None,
output_prefix=None, debug=False, fuzzy_motif_names=False, motif_info=None,
shuffle=None):
shuffle=None, save_locs=False):
"""
Build a motif-based prior from promoter sequences extracted into a FASTA.
Expand Down Expand Up @@ -109,6 +110,10 @@ def build_motif_prior_from_fasta(motif_file, promoter_fasta_file, scanner_type='
genes = select_genes(genes, gene_constraint_list)
motif_peaks = motif_peaks.loc[motif_peaks[MotifScan.chromosome_col].isin(genes[GTF_GENENAME]), :]

if save_locs:
save_locs = output_prefix + "_tf_binding_locs.tsv"
motif_peaks.to_csv(save_locs, sep="\t")

# PROCESS SCORES INTO NETWORK ######################################################################################
print("Processing TF binding sites into prior")
MotifScorer.set_information_criteria(min_binding_ic=motif_ic, max_dist=tandem)
Expand Down
53 changes: 51 additions & 2 deletions inferelator_prior/tests/test_network_maker.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import unittest
import pandas.testing as pdt
import tempfile

from inferelator_prior.motifs import meme, select_motifs, truncate_motifs
from inferelator_prior.processor import gtf
Expand All @@ -10,7 +11,8 @@

artifact_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "artifacts")
data_path = os.path.join(os.path.abspath(os.path.expanduser(os.path.dirname(__file__))), "../../data/")

temppath = tempfile.TemporaryDirectory(prefix="ip_test_")
temp_path_prefix = os.path.join(temppath.name, "prior")

class TestConstraints(unittest.TestCase):

Expand Down Expand Up @@ -91,8 +93,55 @@ def test_full_stack_network_build(self):
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False)
intergenic_only=False,
output_prefix=None)

self.assertEqual(cut.sum().sum(), 3)
self.assertListEqual(cut[cut["GAL4"]].index.tolist(), ["YBR018C", "YBR019C", "YBR020W"])
self.assertEqual((raw > 0).sum().sum(), 3)

def test_file_output(self):
cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix)

self.assertTrue(os.path.exists(temp_path_prefix + "_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "_edge_matrix.tsv.gz"))
self.assertFalse(os.path.exists(temp_path_prefix + "_tf_binding_locs.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "b",
save_locs=True)

self.assertTrue(os.path.exists(temp_path_prefix + "b_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "b_edge_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "b_tf_binding_locs.tsv"))

cut, raw, _ = build_motif_prior_from_genes(os.path.join(artifact_path,
"test_gal4.meme"),
os.path.join(artifact_path,
"Saccharomyces_cerevisiae.R64-1-1.GAL_OPERON.gtf"),
os.path.join(data_path,
"Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa"),
window_size=(500, 100),
intergenic_only=False,
output_prefix=temp_path_prefix + "c",
save_locs=True,
lowmem=True)

self.assertTrue(os.path.exists(temp_path_prefix + "c_unfiltered_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "c_edge_matrix.tsv.gz"))
self.assertTrue(os.path.exists(temp_path_prefix + "c_tf_binding_locs.tsv"))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

install_requires = ["numpy", "pandas>=1.0", "HTSeq", "pybedtools", "scipy", "pathos", "sklearn", "tqdm"]
tests_require = ["coverage", "nose", "pysam"]
version = "0.3.2"
version = "0.3.3"

# Description from README.md
base_dir = os.path.dirname(os.path.abspath(__file__))
Expand Down

0 comments on commit 5e06482

Please sign in to comment.