-
Notifications
You must be signed in to change notification settings - Fork 3
/
create_referencebase.py
213 lines (143 loc) · 8.15 KB
/
create_referencebase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python
# -*- coding: utf-8 -*-
################################################################################
##Create Referencebase##
""""
Author(s); Lindsay Hopson and Charles Hadley King
Date: 04-16-2021
Title of program/source code: Create Referencebase
Code version:3
Type: Python 3.8.0
Availibility:<https://github.com/GW-HIVE/microbiome>
note: This version combined CensuScope_code1.3 and CensuScope_code2.2
This code will go through all your CensuScope output files, grab all unique accessions from the files, check to
see if any of the unique accessions are in the Blacklist (downloaded from https://hive.biochemistry.gwu.edu/gfkb),
removes all the accession that are in the blacklist, then outputs a list of unique accessions (which will be used
to create the reference base required for the Hexagon computations. Biopython package is then used to extract
reference information from NCBI on the list of unique organisms. Non-bacteria organisms are first filtered out.
The organims with incomplete genomes are filtered out and saved to a seperate CSV file for further manual curation.
Output: This code will produce 3 CSV files. The first CSV file contains all unique organisms extracted from
CensuScope output CSV files (include all organims (bacteria, viral, host, ect)). The second CSV file contains only
bacteria with complete genomes (which is the filtered list). The third CSV file contains bacteria that do not have
complete genomes. These bacteria will have to be manually curated to select its most representative complete genome.
"""
################################################################################
#Imports
import os, re, csv
import time
from Bio import Entrez
#Declarations
my_dir = os.getcwd()
term = "dnaAccessionBasedResult" #identifier of all CensuScope output files
gutDB = '/Users/lindsayhopson/Downloads/BMM/GutFeelingKnowledgeBase-v3.csv' #use your appropriate file path
thresh = 10
fileList = []
list_Census = []
blacklist = '/Users/lindsayhopson/Downloads/BMM/blackList-v2.0.csv' #use your appropriate file path
Entrez.email = '[email protected]'#input your email
bacteria_completeGenome = []
not_bacteria = []
bacteriaNot_completeGenome = []
#______________________________________________________________________________#
#GOAL of method: Creates a list of file paths to each of the CensuScope outputs. Each file is identified by the ending term of the file name
def createFileList(my_dir, term):
for subdir, dirs, files in os.walk(my_dir):
for file in files:
if re.search(re.escape(term), file):
result = os.path.join(subdir, file)
fileList.append(result)
return fileList
#______________________________________________________________________________#
#GOAL of method: Takes an file path, empty list, the blackList as a csv, and threshold as an input and returns a list with the accession numbers that are unique to the file
#Accession numbers from the CensuScope output files that are below the threshold are ignored
#Accession numbers from the CensuScope output files that meet the threshold are compared to the accession numbers in Black_List
#The accessions that are not present in the Black_List are added to a new list called list_Census
def readCensusresultFile (file, list_Census, blacklist, threshold):
sample = file.split('_')[-1].split('.')[0] #not really sure what this does but its neccesary for the code to work
#the has something to so with identifying the file name
with open (file, 'r') as read:
reader = csv.reader(read)
for cell in reader:
if cell[0] == 'accession': continue
if cell[0] == 'n/a': continue
accession = cell[0].split('.')[0].split('*')[-1]
if accession in blacklist: continue
if int(cell[5]) < threshold: continue
if accession in list_Census: continue
else: list_Census.append(accession)
return list_Census
#______________________________________________________________________________#
#GOAL of method: Takes the unique list of organisms pulled from all the CensuScope output file and extracts their reference info from NCBI.
#The reference info is then put into a csv file.
def ncbi_ref_info(orgList):
with open('unfiltered_unique_orgs.csv', 'w') as output_File:
for i in orgList:
handle = Entrez.esummary(db ="nucleotide", id=i)
record = Entrez.read(handle)
a = str(record[0]['Title'])
b = str(record[0]['TaxId'])
a = a.replace(',',' ')
handle = Entrez.efetch( db ="taxonomy", id=b)
record = Entrez.read(handle)
c = record[0]['Lineage']
output_File.write(str(i + ","+ a + ','+ c +'\n'))
return 'unfiltered_unique_orgs.csv'
#--------------------------------------------------------#
#Goal of method: Filter through the unique accessions list and remove non-bacteria and incomplete genomes. Save the filtered list as csv
#The filtering process puts all removed organims in a list.
#This was done to that, if needed, one could retrieve the excat organims removes with few minor changes to this method.
def completeBacteria(fileIn):
with open ('complete_bacteria.csv', 'w') as output_File:
with open (fileIn, 'r') as read:
reader = csv.reader(read)
for cell in reader:
Genbank_Accession = cell[0]
Lineage = cell[2].split('; ')
Organism_Name = cell[1]
if cell[2] == 'Lineage': continue
if cell[1] == "Organism Name": continue
if "Bacteria" in Lineage:
if "genome" in Organism_Name:
if "complete genome" in Organism_Name:
bacteria_completeGenome.append(Genbank_Accession)
if "draft genome" in Organism_Name:
bacteriaNot_completeGenome.append(Genbank_Accession)
else: bacteria_completeGenome.append(Genbank_Accession)
else: bacteriaNot_completeGenome.append(Genbank_Accession)
else: not_bacteria.append(Genbank_Accession)
for i in bacteria_completeGenome:
handle = Entrez.esummary(db ="nucleotide", id=i)
record = Entrez.read(handle)
a = str(record[0]['Title'])
b = str(record[0]['TaxId'])
a = a.replace(',',' ')
handle = Entrez.efetch( db ="taxonomy", id=b)
record = Entrez.read(handle)
c = record[0]['Lineage']
output_File.write(str(i + ","+ a + ','+ c +'\n'))
return bacteriaNot_completeGenome
#-------------------------------------------------------------------------#
# GOAL of method: Produces a csv file containing bacteria with incomplete genomes, which will have to be manually curated to find most represenatitve genome.
def manualQC(not_complete_genomes):
with open('bacteria_incomplete_genomes.csv', 'w') as output_File:
for i in not_complete_genomes:
handle = Entrez.esummary(db ="nucleotide", id=i)
record = Entrez.read(handle)
a = str(record[0]['Title'])
b = str(record[0]['TaxId'])
a = a.replace(',',' ')
handle = Entrez.efetch( db ="taxonomy", id=b)
record = Entrez.read(handle)
c = record[0]['Lineage']
output_File.write(str(i + ","+ a + ','+ c +'\n'))
return bacteriaNot_completeGenome
#--------------------------------------------------------------------------#
def main ():
fileList = createFileList(my_dir, term)
for i in fileList:
sample_list = readCensusresultFile(i, list_Census, blacklist, thresh)
in_file = ncbi_ref_info(sample_list)
needQC_list = completeBacteria(in_file)
manualQC(needQC_list)
#______________________________________________________________________________#
if __name__ == '__main__': main()