-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_elfi_simulator.py
163 lines (126 loc) · 5.27 KB
/
run_elfi_simulator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from run_elfi import gen_distances_elfi
from simulate_divergence import read_distfile
import numpy as np
import argparse
import os
import sys
from functools import partial
from multiprocessing import Pool
import tqdm
import matplotlib.pyplot as plt
rng = np.random.default_rng()
def get_options():
description = 'Run simulator of gene gain model'
parser = argparse.ArgumentParser(description=description,
prog='python run_elfi_simulator.py')
IO = parser.add_argument_group('Input/Output options')
IO.add_argument('--distfile',
required=True,
help='popPUNK distance file to fit to. ')
IO.add_argument('--params',
required=True,
help='Parameters file to run simulation.')
IO.add_argument('--outpref',
default="sim",
help='Output prefix. Default = "sim"')
IO.add_argument('--threads',
type=int,
default=1,
help='Number of threads. Default = 1')
return parser.parse_args()
def run_sim(index, params_list, max_real_core, max_hamming_core):
param_set = params_list[index]
size_pan = int(param_set[0])
size_core = int(param_set[1])
avg_gene_freq = float(param_set[2])
base_mu = param_set[3]
n_gen = int(param_set[4])
pop_size = int(param_set[5])
ratio_gene_gl = float(param_set[6])
gene_gl_speed = float(param_set[7])
prop_gene = float(param_set[8])
base_mu = [float(i) for i in base_mu.split(",")]
# set evenly spaced core hamming values across generations. Divide by two as pairwise divergence, final generation has no mutations
core_mu = (max_real_core / (n_gen - 1)) / 2
# round to 6 dp
base_mu = [round(i, 6) for i in base_mu]
# ensure probabilities sum to 1
if sum(base_mu) != 1:
base_mu[-1] = 1 - sum(base_mu[0:3])
base_mu1 = base_mu[0]
base_mu2 = base_mu[1]
base_mu3 = base_mu[2]
base_mu4 = base_mu[3]
core_site_mu1 = 0.25
core_site_mu2 = 0.25
core_site_mu3 = 0.25
core_site_mu4 = 0.25
dist_mat, avg_core, avg_acc = gen_distances_elfi(size_core, size_pan, core_mu, avg_gene_freq, ratio_gene_gl, gene_gl_speed, prop_gene,
base_mu1, base_mu2, base_mu3, base_mu4, core_site_mu1, core_site_mu2, core_site_mu3,
core_site_mu4, pop_size, n_gen, max_real_core, max_hamming_core, True)
#dist_mat[:, 0] = dist_mat[:, 0] * max_hamming_core
#dist_mat[:, 1] = dist_mat[:, 1] * max_jaccard_acc
return index, dist_mat, avg_core, avg_acc
if __name__ == "__main__":
# distfile = "distances/GPSv4_distances_sample1.txt"
# threads = 4
# paramsfile = "parameter_example_test.txt"
# outpref = "test"
options = get_options()
threads = options.threads
distfile = options.distfile
paramsfile = options.params
outpref = options.outpref
# read in real files
df = read_distfile(distfile)
# detemine highest core hamming distance, convert to real space using Jukes-Cantor
max_hamming_core = float(df["Core"].max())
max_jaccard_acc = float(df["Accessory"].max())
max_real_core = (-3/4) * np.log(1 - (4/3 * max_hamming_core))
params_list = []
with open(paramsfile, "r") as f:
next(f)
for line in f:
params_list.append(line.rstrip().split("\t"))
print("Running Simulations...")
with Pool(processes=threads) as pool:
for index, dist_mat, avg_core, avg_acc in tqdm.tqdm(pool.imap(
partial(run_sim, params_list=params_list, max_real_core=max_real_core, max_hamming_core=max_hamming_core),
range(0, len(params_list))), total=len(params_list)):
# save to file
np.savetxt(outpref + "_results_sim_" + str(index + 1) + ".csv", dist_mat, delimiter=",")
avg_mat = np.zeros((avg_core.shape[0], 2))
avg_mat[:, 0] = avg_core
avg_mat[:, 1] = avg_acc
np.savetxt(outpref + "_averages_sim_" + str(index + 1) + ".csv", avg_mat, delimiter=",")
fig, ax = plt.subplots()
x = df["Core"]
y = df["Accessory"]
ax.scatter(x, y, label="Real")
x = np.array(dist_mat[:, 0])
y = np.array(dist_mat[:, 1])
ax.scatter(x, y, label="Sim " + str(index + 1))
ax.set_xlabel("Core Hamming")
ax.set_ylabel("Accessory Jaccard")
fig.savefig(outpref + "_" + "sim_" + str(index + 1) + ".png")
ax.clear()
plt.clf
plt.cla
# plot average distances over time
y = avg_core
ax.plot(y)
ax.set_xlabel("Generation")
ax.set_ylabel("Average Core Hamming")
fig.savefig(outpref + "_" + "avg_core_sim_" + str(index + 1) + ".png")
ax.clear()
plt.clf
plt.cla
# plot average distances over time
y = avg_acc
ax.plot(y)
ax.set_xlabel("Generation")
ax.set_ylabel("Average Acc Jaccard")
fig.savefig(outpref + "_" + "avg_acc_sim_" + str(index + 1) + ".png")
ax.clear()
plt.clf
plt.cla