-
Notifications
You must be signed in to change notification settings - Fork 1
/
3_fuse_data_and_output_results.py
executable file
·187 lines (146 loc) · 6 KB
/
3_fuse_data_and_output_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python2
#This script was used in the publication doi: https://doi.org/10.1101/271569
#to get the data shelves from the different databases and fuse them.
#
#copyright (C) 2017-2018 Martin Engqvist | [email protected]
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#LICENSE:
#
#This program is free software; you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation; either version 3 of the License, or
#(at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU Library General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software Foundation,
#Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
import sys
# This script is written for Python 2 only; stop right away, with an
# explanatory message, when it is started under any other major version.
if not sys.version_info[0] == 2:
    raise SystemExit("Sorry, only Python 2 is supported by this script.")
import shelve
import os
from os.path import join, exists
import json
# Folder layout. PROJ_ROOT_DIRECTORY is '.', so every path below is
# resolved relative to the directory the script is started from.
CURRENT_DIRECTORY = os.getcwd()
PROJ_ROOT_DIRECTORY = '.'
RAW_FOLDER = join(PROJ_ROOT_DIRECTORY, 'data/raw_external')
INTERMEDIATE_FOLDER = join(PROJ_ROOT_DIRECTORY, 'data/intermediate')
FINAL_FOLDER = join(PROJ_ROOT_DIRECTORY, 'data/final')
TAX_FOLDER = join(PROJ_ROOT_DIRECTORY, 'data/raw_external/taxonomy_data')

# Parsed and cleaned shelves of the individual source databases, given
# relative to RAW_FOLDER. They are read in exactly this order.
_SOURCE_SHELVES = (
    'ATCC/ATCC_parsed_cleaned.db',        #ATCC data
    'DSMZ/DSM_parsed_cleaned.db',         #DSMZ data
    'NCTC/NCTC_parsed_cleaned.db',        #NCTC data
    'Pasteur/Pasteur_parsed_cleaned.db',  #Pasteur data
    'NIES/NIES_parsed_cleaned.db',        #NIES data
    'BacDive/BacDive_parsed_cleaned.db',  #BacDive data
)


def _load_shelved_records(shelf_path):
    '''Return the object stored under the key 'data' in the shelve at shelf_path.'''
    shelf = shelve.open(shelf_path)
    try:
        return shelf['data']
    finally:
        # Close the shelve even when the 'data' key is missing or unreadable.
        shelf.close()


# Concatenation of the records of all source databases, in the order above.
all_data = []
for _shelf_name in _SOURCE_SHELVES:
    all_data.extend(_load_shelved_records(join(RAW_FOLDER, _shelf_name)))
def merge_data():
    '''Collect all the information on one organism from all databases and store in shelve.

    Groups the records of the module-level ``all_data`` list by their
    lower-cased 'name'. For every organism the values of the record keys
    'temp', 'taxid', 'lineage', 'medium', 'db', 'domain' and 'risk' are
    gathered into one list per key, in the order the records appear in
    ``all_data``. The resulting dictionary is stored under the key 'data'
    in the shelve 'all_data.db' inside FINAL_FOLDER.

    Records whose 'name' is None cannot be assigned to an organism and
    are skipped.
    '''
    print('Merging data')

    fields = ('temp', 'taxid', 'lineage', 'medium', 'db', 'domain', 'risk')

    #for each record, populate the entry of its organism
    out_data = {}
    for record in all_data:
        name = record['name']
        if name is None:
            # Unnamed records used to be left out of the organism set but
            # were still processed afterwards, failing on None.lower().
            continue

        org = name.lower()
        if org not in out_data:
            out_data[org] = {field: [] for field in fields}

        entry = out_data[org]
        for field in fields:
            entry[field].append(record[field])

    s = shelve.open(join(FINAL_FOLDER, 'all_data.db'))
    try:
        s['data'] = out_data
    finally:
        # Always release the shelve, also when storing the data fails.
        s.close()
    print('Done')
def output_temperature_flatfile():
    '''Make flatfile for temperatures in .tsv format as well as a json dump.

    Reads the merged organism dictionary stored under 'data' in the
    shelve 'all_data.db' inside FINAL_FOLDER. For every organism except
    'bacteriophage lambda' the list of temperatures is replaced by its
    mean, rounded to an integer, or by None when no temperature is
    available. For organisms that have a temperature, the 'taxid',
    'domain' and 'lineage' lists are collapsed to a single value.

    Two files are written to FINAL_FOLDER: 'temperature_data.xml', a JSON
    dump of the processed dictionary, and 'temperature_data.tsv', with
    one row for each organism that has a temperature.

    Raises:
        ValueError: if the records of one organism disagree on the taxid
            or on the domain.
    '''
    s = shelve.open(join(FINAL_FOLDER, 'all_data.db'))
    try:
        data = s['data']
    finally:
        # Always release the shelve, also when reading the data fails.
        s.close()

    #make average of the temperatures
    for org in data:
        if org == 'bacteriophage lambda':
            continue
        entry = data[org]

        #skip nones
        temps = [float(t) for t in entry['temp'] if t is not None]

        #skip organisms without any temperature
        if not temps:
            entry['temp'] = None
            continue

        #make average
        entry['temp'] = int(round(sum(temps) / float(len(temps))))

        #make sure there is only one kind of taxid for each organism
        if len(set(entry['taxid'])) != 1:
            raise ValueError('Conflicting taxids for organism %r: %r' % (org, entry['taxid']))
        entry['taxid'] = entry['taxid'][0]

        #domain
        if len(set(entry['domain'])) != 1:
            raise ValueError('Conflicting domains for organism %r: %r' % (org, entry['domain']))
        entry['domain'] = entry['domain'][0]

        #lineage text
        lineages = entry['lineage']
        if all(lineages[0] == lineage for lineage in lineages):
            entry['lineage'] = lineages[0]
        else:
            # NOTE(review): a mismatch is only reported here; 'lineage'
            # then stays a list and the .tsv export below will fail when
            # it indexes it with a string key -- confirm intended handling.
            print(lineages)

        #lineage number
        # NOTE(review): 'test' looks like a placeholder; it is kept
        # because it ends up in the JSON dump.
        entry['lineage_taxid'] = 'test'

    #write to json file
    # NOTE: the content is JSON although the file name ends in '.xml';
    # the name is left unchanged so that the output path stays the same.
    with open(join(FINAL_FOLDER, 'temperature_data.xml'), 'w') as f:
        json.dump(data, f)

    ranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus')
    # One '%s' per column: five leading columns plus one per rank.
    row_format = '\t'.join(['%s'] * (5 + len(ranks))) + '\n'

    with open(join(FINAL_FOLDER, 'temperature_data.tsv'), 'w') as f:
        f.write('organism\tdomain\ttemperature\ttaxid\tlineage_text\tsuperkingdom\tphylum\tclass\torder\tfamily\tgenus\n')
        for org in sorted(data.keys()):
            if org == 'bacteriophage lambda':
                continue
            entry = data[org]

            temperature = entry['temp']
            if temperature is None:
                continue

            lineage_number = entry['lineage']['lineage_taxid']
            row = [
                org.replace('.', '').replace(' ', '_'),
                entry['domain'],
                temperature,
                entry['taxid'],
                entry['lineage']['lineage_text'],
            ]
            row.extend(lineage_number[rank] for rank in ranks)
            f.write(row_format % tuple(row))
# Run both stages in this order: the flat-file export reads the
# 'all_data.db' shelve that merge_data() writes.
for _stage in (merge_data, output_temperature_flatfile):
    _stage()