-
Notifications
You must be signed in to change notification settings - Fork 0
/
ResCap_analysis.py
executable file
·123 lines (89 loc) · 3.76 KB
/
ResCap_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python3
import subprocess
import glob
import sys
import os
import pandas
import argparse
#Parsing the script, description and selection of initila file and number of cores
parser = argparse.ArgumentParser(description=""
"ResCap_analysis_1.0\n"
"Hello world!\t"
"This script allows you to analyze the results from the ResCap capture platform \t"
"It performs an on-target selection, then a subsampling to normalize the samples and "
"erase outliers, and then a selection of the resistance genes with KMA\t"
"\t\tLet's hope it works! ¯\_(ツ)_/¯\t\t")
parser.add_argument("-c","--clusters",default=1, help= 'Number of clusters available to use')
parser.add_argument("-t","--table", help='CSV files (comma separated) with two columns, called R1 and R2')
args = parser.parse_args()
cores = args.clusters
cores=str(cores)
print('number of cores =', cores)
scriptpath = os.path.dirname(sys.argv[0])
bowtie_db = '/dbs/bowtie_db/plataforma'
bowtie_db_path= scriptpath+bowtie_db
current_directory1 = os.getcwd()
extractR1R2 = '/*R1.fastq.gz'
loop_extract_path = current_directory1+extractR1R2
if args.table is not None:
table_name = args.table
df = pandas.read_csv(table_name, sep=',',
dtype=str)
dict_of_samples = (df.set_index('R1')['R2'].to_dict())
for key, value in dict_of_samples.items():
salida_t = key+'.bow.gz'
subprocess.call(['bowtie2', '-x', bowtie_db_path, '-p', cores, '-1', key, '-2', value, '-q', '--al-conc-gz', salida_t])
else:
for R1 in glob.iglob(loop_extract_path):
R2 = R1.replace('R1', 'R2')
salida = R1.replace('.R1.fastq.gz', '.bow.gz')
subprocess.call(['bowtie2','-x',bowtie_db_path,'-p',cores,'-1',R1, '-2', R2,'-q', '--al-conc-gz', salida])
dict_of_reads={}
list_of_reads=[]
seqkit='/bin/seqkit'
seqkit_path = scriptpath+seqkit
cmd= "{0} stats *b* -T -j {1} -o Tabla.txt".format(seqkit_path, cores)
subprocess.call(cmd, shell=True)
df = pandas.read_csv('Tabla.txt', sep='\t', dtype=str)
df1 = df[['file','num_seqs']]
df1.num_seqs = df1.num_seqs.astype(float)
dict_of_reads = (df1.set_index('file')['num_seqs'].to_dict())
for key, value in dict_of_reads.items():
list_of_reads.append(value)
lowest_num = min(list_of_reads)
lowest_num = str(int(lowest_num))
sum=0
for num in list_of_reads:
sum = sum +num
average= sum/len(list_of_reads)
outliers_value_norm=(average*0.1)
outliers_value=int(average*0.1)
outliers_value=str(outliers_value)
outliers_list=[]
outliers_value_list=[]
for key, value in dict_of_reads.items():
if value <= outliers_value_norm:
outliers_list.append(key)
outliers_value_list.append(value)
os.makedirs('./outliers')
for f in outliers_list:
subprocess.call(['mv', f,'./outliers'])
current_directory1 = os.getcwd()
extract_b = '/*.bow*'
loop_extract_b_path = current_directory1+extract_b
seqkit='/bin/seqkit'
seqkit_path = scriptpath+seqkit
for original in glob.iglob(loop_extract_b_path):
salida1 = original.replace('.bow.', '.kma.')
subprocess.call([seqkit_path, 'sample', original, '-n', lowest_num, '-j', cores, '-o', salida1])
current_directory1 = os.getcwd()
extract_c = '/*c.1.gz'
loop_extract_c_path = current_directory1+extract_c
kma = '/bin/kma/kma'
kma_path= scriptpath+kma
kma_db = '/dbs/kma_db/data-base-kma'
kma_db_path= scriptpath+kma_db
for c1 in glob.iglob(loop_extract_c_path):
c2 = c1.replace('.1', '.2')
salida1 = c1.replace('.kma.1.gz', '')
subprocess.call([kma_path,'-ipe', c1, c2, '-o', salida1, '-t_db', kma_db_path, '-t', cores])