-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmimic.py
150 lines (113 loc) · 5.82 KB
/
mimic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
'''
MIMIC Software
@Authors:
Ryan Doughty, Iva Kotaskova, Kasambula Arthur Shem, Shwetha Kumar, Mike Nute, Todd Treangen, Mike Nute
@Version 0.1
Mimic creates simulated metagenomes based off of real existing metagenomes.
'''
import argparse
import os
import pathlib
import subprocess
from src.tax_identification import run_bracken, run_kraken2, run_lemur, parse_magnet_output
from src.sim import generate_species_file_info, prep_sim_lemur, run_read_analysis, run_sim
__author__ = "Ryan Doughty, Iva Kotaskova, Kasambula Arthur Shem, Shwetha Kumar, Mike Nute, Todd Treangen"
__contact__ = "[email protected]"
__license__ = "MIT"
__version__ = "0.1"
__email__ = "[email protected]"
__status__ = "Development"
def print_info():
"""
Prints tool information
"""
print('----------------------------------------')
print(f'MIMIC Software Version {__version__}')
print('----------------------------------------')
print(f'Contact: {__contact__}')
print(f'Authors: {__author__}\n')
print(f'Status: {__status__}\n\n')
def initialize_working(working:str):
"""Initializes working directory, creates necessary sub directories"""
if os.path.exists(working):
raise SystemExit('Initializing Working Directory Failed, working directory already exists')
else:
os.mkdir(working)
lemur = os.path.join(working, 'lemur')
os.mkdir(lemur)
magnet = os.path.join(working, 'magnet')
os.mkdir(magnet)
nanosim = os.path.join(working, 'nanosim')
os.mkdir(nanosim)
simulated_data = os.path.join(working, 'simulated_data')
os.mkdir(simulated_data)
print('Initialized Working Directory\n')
def run_mimic(args):
"""
Main pipeline function for the MIMIC
"""
fastq1 = args.fastq
fastq2 = args.fastq2
output = args.output
threads = args.threads
lemur_db = args.db
num_reads = args.reads
simulate_only = args.simulate_only
perfect = args.perfect
nanosim_loc = os.path.join(output, 'nanosim')
lemur_out = os.path.join(output, 'lemur')
magnet_out = os.path.join(output, 'magnet')
if not simulate_only:
initialize_working(output)
## run lemur
report_loc = run_lemur(fastq1, lemur_db, lemur_out, threads=threads)
report_loc = os.path.join(lemur_out, 'relative_abundance.tsv')
if fastq2 is not None:
subprocess.run(['python', 'magnet/magnet.py',
'-c', report_loc,
'-i', fastq1,
'-I', fastq2,
'-o', magnet_out,
'--threads', str(threads)], check=True)
else:
subprocess.run(['python', 'magnet/magnet.py',
'-c', report_loc,
'-i', fastq1,
'-o', magnet_out,
'-a', '12',
'--threads', str(threads)], check=True)
magnet_report = os.path.join(magnet_out, 'cluster_representative.csv')
if not os.path.exists(magnet_report):
raise SystemExit('Magnet failed')
## get necessary files for the nanosim input and run nanosim
genome_list = prep_sim_lemur(magnet_report, report_loc, nanosim_loc, output, num_reads)
species_info = generate_species_file_info(genome_list, nanosim_loc)
genome_list_loc = os.path.join(nanosim_loc, 'genome_list1.tsv')
run_read_analysis(fastq1, genome_list_loc, nanosim_loc, threads=threads) ## nanosim step 1
genome_list_loc = os.path.join(nanosim_loc, 'genome_list2.tsv')
abundances = os.path.join(nanosim_loc, 'abundances.tsv')
species_loc = os.path.join(nanosim_loc, 'species_info.tsv')
run_sim(genome_list_loc, abundances, species_loc, nanosim_loc, perfect=perfect, threads=threads) ## nanosim step 2
##concatenate the two fasta files and shuffle results (TODO: quick and dirty solution, will fix)
subprocess.run(f'cat {output}/nanosim/*.fasta > {output}/nanosim/simulated.fasta', shell=True, check=True)
subprocess.run(f'rm {output}/nanosim/simulated_sample*.fasta ', shell=True, check=True)
subprocess.run(f'mv {output}/nanosim/simulated.fasta {output}/simulated_data/', shell=True, check=True)
if not perfect:
subprocess.run(f'mv {output}/nanosim/simulated_sample* {output}/simulated_data/error_data', shell=True, check=True)
else:
subprocess.run(f'rm {output}/nanosim/simulated_sample*', shell=True, check=True)
def parse_args():
parser = argparse.ArgumentParser(description="Universal Taxonomic Classification Verifier.")
parser.add_argument("-i", "--fastq", type=pathlib.Path, required=True, help="Path to first fastq file")
parser.add_argument("-I", "--fastq2", type=pathlib.Path, required=False, help="Path to second fastq file for paired-end reads")
parser.add_argument("-o", "--output", type=pathlib.Path, required=True, help="Path to the output directory.")
parser.add_argument("--db", type=str, required=True, help='Kraken2 database location')
parser.add_argument('-t', '--threads', type=int, required=False, default=1, help='Number of threads for multithreading (Default: 1)')
parser.add_argument('-r', '--reads', type=int, required=True, default=100, help='Number of simulated reads to generate')
parser.add_argument('--simulate-only', action='store_true', help='Only runs simulation (must have already run pipeline on sample once, will override existing simulated data)')
parser.add_argument('--perfect', action='store_true', help='Will generate perfect reads with no errors')
args = parser.parse_args()
run_mimic(args)
if __name__=='__main__':
print_info()
parse_args()