-
Notifications
You must be signed in to change notification settings - Fork 3
/
convert_to_perlodes.py
172 lines (144 loc) · 8.25 KB
/
convert_to_perlodes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import PySimpleGUI as sg
import pandas as pd
from pandas import DataFrame
import numpy as np
from pathlib import Path
import sys, subprocess, os
def convert_to_perlodes(TaXon_table_xlsx, operational_taxon_list, path_to_outdirs, meta_data_to_test):
def open_table(table):
if sys.platform == "win32":
os.startfile(table)
else:
opener = "open" if sys.platform == 'darwin' else 'xdg-open'
subprocess.call([opener, table])
#get the taxonomy from the operational taxon list
operational_taxon_list_df = pd.read_excel(Path(operational_taxon_list), header=2, sheet_name="Operationelle Taxaliste")
taxonomy_list = operational_taxon_list_df["Taxonname\n(Perlodes-Datenbank)"].values.tolist()
taxonomy_list = [x for x in taxonomy_list if str(x) != 'nan']
# get the according IDs from the operational taxon list
IDs_list = operational_taxon_list_df["ID_\nART"].values.tolist()
IDs_list = [x for x in IDs_list if str(x) != 'nan']
# create a dict to store both the ID and the taxonomy
operational_taxon_list_dict = {}
for ID, taxonomy in zip(IDs_list, taxonomy_list):
operational_taxon_list_dict[taxonomy] = int(ID)
# load the taxon table and create a list
TaXon_table_xlsx = Path(TaXon_table_xlsx)
TaXon_table_df = pd.read_excel(TaXon_table_xlsx)
TaXon_table_taxonomy = TaXon_table_df.columns.tolist()[0:7]
samples_list = TaXon_table_df.columns.tolist()[10:]
## load the metadata -> freshwater type
Meta_data_table_xlsx = Path(str(path_to_outdirs) + "/" + "Meta_data_table" + "/" + TaXon_table_xlsx.stem + "_metadata.xlsx")
Meta_data_table_df = pd.read_excel(Meta_data_table_xlsx, header=0).fillna("nan")
Meta_data_table_samples = Meta_data_table_df['Samples'].tolist()
metadata_loc = Meta_data_table_df.columns.tolist().index(meta_data_to_test)
types_list = Meta_data_table_df[meta_data_to_test].values.tolist()
## drop samples with metadata called nan (= empty)
drop_samples = [i[0] for i in Meta_data_table_df.values.tolist() if i[metadata_loc] == "nan"]
if drop_samples != []:
sg.PopupError("Please fill out all the metadata for all samples.")
elif sorted(Meta_data_table_samples) == sorted(samples_list):
## use the metadata sample sorting
samples_list = sorted(Meta_data_table_samples)
# store hits and dropped OTUs
hit_list, dropped_list, transversion_list = [], [], []
# loop through the taxon table
for taxonomy in TaXon_table_df[TaXon_table_taxonomy].drop_duplicates().values.tolist():
# collect the OTU name, species, genus and family and convert to perlodes format
OTU = taxonomy[0]
Species = taxonomy[6]
Species_group = str(taxonomy[6]) + "-Gr."
Genus = str(taxonomy[5]) + " sp."
Family = str(taxonomy[4]) + " Gen. sp."
# test if the OTU has a hit at: Species level, Genus level or Family level
if Species in operational_taxon_list_dict.keys():
# add to hit list
hit_list.append([OTU] + [str(operational_taxon_list_dict[Species])] + [Species])
# add to perlodes log file
transversion_list.append(taxonomy + [str(operational_taxon_list_dict[Species])] + [Species])
elif Species_group in operational_taxon_list_dict.keys():
hit_list.append([OTU] + [str(operational_taxon_list_dict[Species_group])] + [Species_group])
transversion_list.append(taxonomy + [str(operational_taxon_list_dict[Species_group])] + [Species_group])
elif Genus in operational_taxon_list_dict.keys():
hit_list.append([OTU] + [str(operational_taxon_list_dict[Genus])] + [Genus])
transversion_list.append(taxonomy + [str(operational_taxon_list_dict[Genus])] + [Genus])
elif Family in operational_taxon_list_dict.keys():
hit_list.append([OTU] + [str(operational_taxon_list_dict[Family])] + [Family])
transversion_list.append(taxonomy + [str(operational_taxon_list_dict[Family])] + [Family])
# otherwise store the hit with an "nan"
else:
hit_list.append([OTU] + ["nan"])
dropped_list.append(OTU)
transversion_list.append(taxonomy + ["", ""])
# create an output list for perlodes
# make read abundaces binary
perlodes_input_list = []
for hit, row in zip(hit_list, TaXon_table_df[samples_list].values.tolist()):
# skip OTUs that were not in the OPT
if hit[1] != "nan":
reads_list = []
# loop through all the sample of the file
for reads in row:
# reads > 0 --> 1
if reads > 0:
reads_list.append(1)
# reads == 0 --> 0
else:
reads_list.append(0)
# now append the taxonomy and the presence/absence to the perlodes list
perlodes_input_list.append([hit[1]] + [hit[2]] + reads_list)
# calculate the number of dropped OTUs
number_of_initial_OTUs = len(TaXon_table_df[TaXon_table_taxonomy].drop_duplicates().values.tolist())
# write the perlodes output file
perlodes_df = pd.DataFrame(perlodes_input_list)
perlodes_df.columns = ["ID_ART", "TAXON_NAME"] + samples_list
# Perlocdes sums up the counts of all df_duplicates! So these must be removed
perlodes_df.index = perlodes_df["TAXON_NAME"]
# create a set of all present taxa
taxa_set = set(perlodes_df["TAXON_NAME"].values.tolist())
perlodes_filtered_list = []
# loop through all target taxa
for taxon in taxa_set:
# collect the ID and Taxon name
ID_taxon_name = perlodes_df.loc[taxon][["ID_ART", "TAXON_NAME"]].values.tolist()[0]
# if there are duplicates:
try:
# calculate the sum for each sample
sum_of_pa = list(perlodes_df.loc[taxon][samples_list].sum())
# start a new list for each taxon
pa_list = ID_taxon_name
# replace the count by either 1 or 0
for pa in sum_of_pa:
if pa > 0:
pa_list.append(1)
else:
pa_list.append(0)
perlodes_filtered_list.append(pa_list)
# if there are no duplicates
except:
# simply take the old presence/absence line
perlodes_filtered_list.append(perlodes_df.loc[taxon].values.tolist())
## create header rows for the input file
a = [["Gewässertyp", ""] + types_list]
b = [["Taxaliste", ""] + ["original"] * len(samples_list)]
c = [["Nutzung", ""] + ["keine"] * len(samples_list)]
# write the filtered list to a dataframe
perlodes_df = pd.DataFrame(a + b + c + perlodes_filtered_list)
perlodes_df.columns = ["ID_ART", "TAXON_NAME"] + samples_list
perlodes_directory = Path(str(path_to_outdirs) + "/" + "Perlodes" + "/" + TaXon_table_xlsx.stem)
perlodes_xlsx = Path(str(perlodes_directory) + "_perlodes.xlsx")
perlodes_df.to_excel(perlodes_xlsx, sheet_name='ImportList', index=False)
# write the log file file to a different dataframe
transversion_df = pd.DataFrame(transversion_list)
transversion_df.columns = ["ID", "Phylum", "Class", "Order", "Family", "Genus", "Species", "ID_ART", "TAXON_NAME"]
transversion_xlsx = Path(str(perlodes_directory) + "_perlodes_conversion_table.xlsx")
transversion_df.to_excel(transversion_xlsx, index=False)
closing_text = "Perlodes input file is found under:\n" + '/'.join(str(perlodes_xlsx).split("/")[-4:])
info_text = "\n\nDropped " + str(len(dropped_list)) + " of " + str(number_of_initial_OTUs) + " OTUs.\n"
answer = sg.PopupYesNo(closing_text + info_text + "\nOpen table?")
if answer == "Yes":
open_table(perlodes_xlsx)
from taxontabletools.create_log import ttt_log
ttt_log("perlodes conversion", "processing", TaXon_table_xlsx.name, perlodes_xlsx.name, "nan", path_to_outdirs)
else:
sg.PopupError("Error: The samples from the metadata table do not match!")