Skip to content

Commit

Permalink
functions removed, comments added,...
Browse files Browse the repository at this point in the history
  • Loading branch information
jalhackl committed Aug 17, 2023
1 parent ad86e4e commit d5e6b74
Show file tree
Hide file tree
Showing 9 changed files with 1,412 additions and 10,868 deletions.
7 changes: 3 additions & 4 deletions sstar/archie_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,17 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut
ref_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".ref.ind.list"
tgt_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".tgt.ind.list"

#ref_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "ref.scr1.list")
#tgt_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "sim.src1.list")
scikitfile = output_prefix + ".scikit.pickle"
statsmodelsfile = output_prefix + ".statsmodels.pickle"

#get all folders for prediction
final_folders = infer.get_all_folders(model_name, os.path.join("nref_" + str(nref), "ntgt_" + str(ntgt)))

sample_name = "nref_" + str(nref) + "_ntgt_" + str(ntgt)

#without ref_ind_file and tgt_ind_file (these are created within infer.predict_introgression_folders)
#infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
#without ref_ind_file and tgt_ind_file (these are created within infer)

infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)


Expand Down Expand Up @@ -73,5 +73,4 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut
seed = int(args.seed)
model_name=args.model_name


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed,model_name)
77 changes: 0 additions & 77 deletions sstar/archie_infer_haplotypes.py

This file was deleted.

137 changes: 0 additions & 137 deletions sstar/archie_only_train.py

This file was deleted.

42 changes: 10 additions & 32 deletions sstar/archie_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,94 +17,73 @@

def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions, create_testdirs = False):


#this variable determines how many prediction directories are created within one folder
nrep_per_folder = int(nrep / folder_partitions)


train_df_list = []
for i in range(folder_partitions):
curr_output_dir = output_dir + str(i)

preprocess.store_global_parameters(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir)
#create a training folder for the training set
if not os.path.exists(curr_output_dir):
os.makedirs(curr_output_dir)

#and for the test set
if create_testdirs == True:
if not os.path.exists(curr_output_dir + "test"):
os.makedirs(curr_output_dir + "test")

train._simulation_manager(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, seed)


new_train_df = train._train_archie_return_df(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, drop_dynamic_cols=False)
new_train_df = train._train_archie(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, drop_dynamic_cols=False)

train_df_list.append(new_train_df)


#after appending the features to the dataframe, the folder containing the training examples is deleted
shutil.rmtree(curr_output_dir)

#the full train dataframe
train_df = pd.concat(train_df_list)

#create also reduced dfs

#drop_dynamic_cols indicates whether non-fixed-size features should be dropped
#create reduced dfs for training on reduced data sets
train_df_reduced = train_df.copy()

train_df_no_kurtosis = train_df.copy()
train_df_no_paired_dist = train_df.copy()

train_df_target_full_reduced = train_df.copy()



#drop_dynamic_cols indicates whether non-fixed-size features should be dropped, etc.
dynamic_cols = [col for col in train_df.columns if ('-ton' in col or col.startswith("pairwised_dist"))]

no_kurtosis_cols = [col for col in train_df.columns if ('kurtosis_pairwised_dist' in col or col.startswith("pairwised_dist")) ]

no_paired_cols = [col for col in train_df.columns if (col.startswith("pairwised_dist"))]

full_reduced_cols = [col for col in train_df.columns if ('-ton' in col or 'pairwised_dist' in col )]

train_df_reduced.drop(dynamic_cols, axis=1, inplace = True, errors='ignore')

train_df_no_kurtosis.drop(no_kurtosis_cols, axis=1, inplace = True, errors='ignore')
train_df_no_paired_dist.drop(no_paired_cols, axis=1, inplace = True, errors='ignore')
train_df_target_full_reduced.drop(full_reduced_cols, axis=1, inplace = True, errors='ignore')



#reduced dataframes
train_df.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile.csv")

train_df_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_fixed.csv")

train_df_no_kurtosis.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nokurtosis.csv")
train_df_no_paired_dist.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nopaired.csv")
train_df_target_full_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_tgtfullreduced.csv")


#names for models
scikit_file = output_prefix + ".scikit.pickle"
statsmodels_file = output_prefix + ".statsmodels.pickle"
scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"

scikit_file_no_kurtosis = "nokurt_" + output_prefix + ".scikit.pickle"
statsmodels_file_no_kurtosis = "nokurt_" + output_prefix + ".statsmodels.pickle"

scikit_file_no_paired_dist = "nopaired_" + output_prefix + ".scikit.pickle"
statsmodels_file_no_paired_dist = "nopaired_" + output_prefix + ".statsmodels.pickle"

scikit_file_full_reduced = "fullreduced_" + output_prefix + ".scikit.pickle"
statsmodels_file_full_reduced = "fullreduced_" + output_prefix + ".statsmodels.pickle"

scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"

#call training functions

train.train_statsmodels(train_df, statsmodels_file)

train.train_scikit(train_df, scikit_file)

train.train_statsmodels(train_df_reduced, statsmodels_file_reduced)
train.train_scikit(train_df_reduced, scikit_file_reduced)

Expand Down Expand Up @@ -160,5 +139,4 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut

folder_partitions = args.folder_partitions


main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions)
Loading

0 comments on commit d5e6b74

Please sign in to comment.