Skip to content

Commit

Permalink
parallelization, improved accuracy calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
jalhackl committed Aug 2, 2023
1 parent f6db2dd commit ad86e4e
Show file tree
Hide file tree
Showing 7 changed files with 3,428 additions and 16 deletions.
77 changes: 77 additions & 0 deletions sstar/archie_infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import sstar
import utils
import os
import demes
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import nbinom

import train
import infer
import preprocess

import argparse, os, sys, signal

#def main(args=None):
def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, model_name):

ref_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".ref.ind.list"
tgt_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".tgt.ind.list"

#ref_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "ref.scr1.list")
#tgt_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "sim.src1.list")
scikitfile = output_prefix + ".scikit.pickle"
statsmodelsfile = output_prefix + ".statsmodels.pickle"

final_folders = infer.get_all_folders(model_name, os.path.join("nref_" + str(nref), "ntgt_" + str(ntgt)))

sample_name = "nref_" + str(nref) + "_ntgt_" + str(ntgt)

#infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
#without ref_ and tgt_ind_file (are created within infer)
infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--demo_model_file',type=str, required=True)
parser.add_argument('--nrep', type=int, required=True)
parser.add_argument('--nref', type=int, required=True)
parser.add_argument('--ntgt',type=int, required=True)
parser.add_argument('--ref_id',type=str, required=True)
parser.add_argument('--tgt_id', type=str, required=True)
parser.add_argument('--src_id', type=str, required=True)
parser.add_argument('--seq_len', type=int, required=True)
parser.add_argument('--mut_rate',type=float, required=True)
parser.add_argument('--rec_rate',type=float, required=True)
parser.add_argument('--thread',type=int, required=True)
parser.add_argument('--output_prefix',type=str, required=True)
parser.add_argument('--output_dir',type=str, required=True)
parser.add_argument('--seed',required=True)
parser.add_argument('--model_name',type=str,required=True)

args = parser.parse_args()

demo_model_file = args.demo_model_file
nrep = args.nrep
nref = args.nref
ntgt = args.ntgt
ref_id = args.ref_id
tgt_id = args.tgt_id
src_id = args.src_id
seq_len = args.seq_len
mut_rate = args.mut_rate
rec_rate = args.rec_rate
thread = args.thread
output_prefix = args.output_prefix
output_dir = args.output_dir
if args.seed == "None":
seed = None
else:
seed = int(args.seed)
model_name=args.model_name


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed,model_name)
77 changes: 77 additions & 0 deletions sstar/archie_infer_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import sstar
import utils
import os
import demes
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import nbinom

import train
import infer
import preprocess

import argparse, os, sys, signal

#def main(args=None):
def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, model_name):

ref_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".ref.ind.list"
tgt_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".tgt.ind.list"

#ref_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "ref.scr1.list")
#tgt_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "sim.src1.list")
scikitfile = output_prefix + ".scikit.pickle"
statsmodelsfile = output_prefix + ".statsmodels.pickle"

final_folders = infer.get_all_folders(model_name, os.path.join("nref_" + str(nref), "ntgt_" + str(ntgt)))

sample_name = "nref_" + str(nref) + "_ntgt_" + str(ntgt)

#infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
#without ref_ and tgt_ind_file (are created within infer)
infer.predict_introgression_folders_allmodeltypes(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--demo_model_file',type=str, required=True)
parser.add_argument('--nrep', type=int, required=True)
parser.add_argument('--nref', type=int, required=True)
parser.add_argument('--ntgt',type=int, required=True)
parser.add_argument('--ref_id',type=str, required=True)
parser.add_argument('--tgt_id', type=str, required=True)
parser.add_argument('--src_id', type=str, required=True)
parser.add_argument('--seq_len', type=int, required=True)
parser.add_argument('--mut_rate',type=float, required=True)
parser.add_argument('--rec_rate',type=float, required=True)
parser.add_argument('--thread',type=int, required=True)
parser.add_argument('--output_prefix',type=str, required=True)
parser.add_argument('--output_dir',type=str, required=True)
parser.add_argument('--seed',required=True)
parser.add_argument('--model_name',type=str,required=True)

args = parser.parse_args()

demo_model_file = args.demo_model_file
nrep = args.nrep
nref = args.nref
ntgt = args.ntgt
ref_id = args.ref_id
tgt_id = args.tgt_id
src_id = args.src_id
seq_len = args.seq_len
mut_rate = args.mut_rate
rec_rate = args.rec_rate
thread = args.thread
output_prefix = args.output_prefix
output_dir = args.output_dir
if args.seed == "None":
seed = None
else:
seed = int(args.seed)
model_name=args.model_name


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed,model_name)
77 changes: 77 additions & 0 deletions sstar/archie_infer_haplotypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import sstar
import utils
import os
import demes
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import nbinom

import train
import infer
import preprocess

import argparse, os, sys, signal

#def main(args=None):
def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, model_name):

ref_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".ref.ind.list"
tgt_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".tgt.ind.list"

#ref_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "ref.scr1.list")
#tgt_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "sim.src1.list")
scikitfile = output_prefix + ".scikit.pickle"
statsmodelsfile = output_prefix + ".statsmodels.pickle"

final_folders = infer.get_all_folders(model_name, os.path.join("nref_" + str(nref), "ntgt_" + str(ntgt)))

sample_name = "nref_" + str(nref) + "_ntgt_" + str(ntgt)

#infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
#without ref_ and tgt_ind_file (are created within infer)
infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False, use_haplotype_acc=True)


if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--demo_model_file',type=str, required=True)
parser.add_argument('--nrep', type=int, required=True)
parser.add_argument('--nref', type=int, required=True)
parser.add_argument('--ntgt',type=int, required=True)
parser.add_argument('--ref_id',type=str, required=True)
parser.add_argument('--tgt_id', type=str, required=True)
parser.add_argument('--src_id', type=str, required=True)
parser.add_argument('--seq_len', type=int, required=True)
parser.add_argument('--mut_rate',type=float, required=True)
parser.add_argument('--rec_rate',type=float, required=True)
parser.add_argument('--thread',type=int, required=True)
parser.add_argument('--output_prefix',type=str, required=True)
parser.add_argument('--output_dir',type=str, required=True)
parser.add_argument('--seed',required=True)
parser.add_argument('--model_name',type=str,required=True)

args = parser.parse_args()

demo_model_file = args.demo_model_file
nrep = args.nrep
nref = args.nref
ntgt = args.ntgt
ref_id = args.ref_id
tgt_id = args.tgt_id
src_id = args.src_id
seq_len = args.seq_len
mut_rate = args.mut_rate
rec_rate = args.rec_rate
thread = args.thread
output_prefix = args.output_prefix
output_dir = args.output_dir
if args.seed == "None":
seed = None
else:
seed = int(args.seed)
model_name=args.model_name


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed,model_name)
137 changes: 137 additions & 0 deletions sstar/archie_only_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import sstar
import utils
import os
import demes
import numpy as np
import pandas as pd
from scipy.stats import norm
from scipy.stats import nbinom

import train
import infer
import preprocess

import argparse, os, sys, signal
import shutil


def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions, create_testdirs = False, train_file_name=None):

if train_file_name == None:
train_file_name = str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile.csv"

train_df = pd.read_csv(train_file_name)
#drop_dynamic_cols indicate whether non-fixed size features should be dropped
train_df_reduced = train_df.copy()

train_df_no_kurtosis = train_df.copy()
train_df_no_paired_dist = train_df.copy()

train_df_target_full_reduced = train_df.copy()



dynamic_cols = [col for col in train_df.columns if ('-ton' in col or col.startswith("pairwised_dist"))]

no_kurtosis_cols = [col for col in train_df.columns if ('kurtosis_pairwised_dist' in col or col.startswith("pairwised_dist")) ]

no_paired_cols = [col for col in train_df.columns if (col.startswith("pairwised_dist"))]

full_reduced_cols = [col for col in train_df.columns if ('-ton' in col or 'pairwised_dist' in col )]

train_df_reduced.drop(dynamic_cols, axis=1, inplace = True, errors='ignore')

train_df_no_kurtosis.drop(no_kurtosis_cols, axis=1, inplace = True, errors='ignore')
train_df_no_paired_dist.drop(no_paired_cols, axis=1, inplace = True, errors='ignore')
train_df_target_full_reduced.drop(full_reduced_cols, axis=1, inplace = True, errors='ignore')



#train_df.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile.csv")

train_df_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_fixed.csv")

train_df_no_kurtosis.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nokurtosis.csv")
train_df_no_paired_dist.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nopaired.csv")
train_df_target_full_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_tgtfullreduced.csv")


scikit_file = output_prefix + ".scikit.pickle"
statsmodels_file = output_prefix + ".statsmodels.pickle"
scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"

scikit_file_no_kurtosis = "nokurt_" + output_prefix + ".scikit.pickle"
statsmodels_file_no_kurtosis = "nokurt_" + output_prefix + ".statsmodels.pickle"

scikit_file_no_paired_dist = "nopaired_" + output_prefix + ".scikit.pickle"
statsmodels_file_no_paired_dist = "nopaired_" + output_prefix + ".statsmodels.pickle"

scikit_file_full_reduced = "fullreduced_" + output_prefix + ".scikit.pickle"
statsmodels_file_full_reduced = "fullreduced_" + output_prefix + ".statsmodels.pickle"

scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"

#call training functions

train.train_statsmodels(train_df, statsmodels_file)

train.train_scikit(train_df, scikit_file)
train.train_statsmodels(train_df_reduced, statsmodels_file_reduced)
train.train_scikit(train_df_reduced, scikit_file_reduced)

train.train_statsmodels(train_df_no_kurtosis, statsmodels_file_no_kurtosis)
train.train_scikit(train_df_no_kurtosis, scikit_file_no_kurtosis)

train.train_statsmodels(train_df_no_paired_dist, statsmodels_file_no_paired_dist)
train.train_scikit(train_df_no_paired_dist, scikit_file_no_paired_dist)

train.train_statsmodels(train_df_target_full_reduced, statsmodels_file_full_reduced)
train.train_scikit(train_df_target_full_reduced, scikit_file_full_reduced)



if __name__ == "__main__":

parser = argparse.ArgumentParser()
parser.add_argument('--demo_model_file',type=str, required=True)
parser.add_argument('--nrep', type=int, required=True)
parser.add_argument('--nref', type=int, required=True)
parser.add_argument('--ntgt',type=int, required=True)
parser.add_argument('--ref_id',type=str, required=True)
parser.add_argument('--tgt_id', type=str, required=True)
parser.add_argument('--src_id', type=str, required=True)
parser.add_argument('--seq_len', type=int, required=True)
parser.add_argument('--mut_rate',type=float, required=True)
parser.add_argument('--rec_rate',type=float, required=True)
parser.add_argument('--thread',type=int, required=True)
parser.add_argument('--output_prefix',type=str, required=True)
parser.add_argument('--output_dir',type=str, required=True)
parser.add_argument('--seed',required=True)

parser.add_argument('--folder_partitions',type=int,required=True)

args = parser.parse_args()
demo_model_file = args.demo_model_file
nrep = args.nrep
nref = args.nref
ntgt = args.ntgt
ref_id = args.ref_id
tgt_id = args.tgt_id
src_id = args.src_id
seq_len = args.seq_len
mut_rate = args.mut_rate
rec_rate = args.rec_rate
thread = args.thread
output_prefix = args.output_prefix
output_dir = args.output_dir
if args.seed == "None":
seed = None
else:
seed = int(args.seed)

folder_partitions = args.folder_partitions


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions)
Loading

0 comments on commit ad86e4e

Please sign in to comment.