functions removed, comments added,...

xin-huang · Aug 17, 2023 · d5e6b74 · d5e6b74
1 parent ad86e4e
commit d5e6b74
Show file tree

Hide file tree

Showing 9 changed files with 1,412 additions and 10,868 deletions.
diff --git a/sstar/archie_infer.py b/sstar/archie_infer.py
@@ -19,17 +19,17 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut
     ref_ind_file = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".ref.ind.list"
     tgt_ind_file  = str(demo_model_file) + "_new_sim" + "_nref" + str(nref) + "_ntgt" + str(ntgt) + ".tgt.ind.list"
 
-    #ref_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "ref.scr1.list")
-    #tgt_ind_file = os.path.join("config", "simulation", "nref_" + str(nref), "ntgt_" + str(ntgt), "sim.src1.list")
     scikitfile = output_prefix + ".scikit.pickle"
     statsmodelsfile = output_prefix + ".statsmodels.pickle"
 
+    #get all folders for prediction
     final_folders = infer.get_all_folders(model_name, os.path.join("nref_" + str(nref), "ntgt_" + str(ntgt)))
 
     sample_name = "nref_" + str(nref) + "_ntgt_" + str(ntgt)
 
+    #without ref_ and tgt_ind_file (are created within infer.predict_introgression_folders)
     #infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
-    #without ref_ and tgt_ind_file (are created within infer)
+
     infer.predict_introgression_folders(nrep, nref, ntgt, seq_len, thread, output_prefix+ "test", final_folders, statsmodel=statsmodelsfile, scikitmodel=scikitfile, sample_name=sample_name, ref_ind_file=ref_ind_file, tgt_ind_file=tgt_ind_file, model_name=model_name, drop_dynamic_cols=False, evaluate=False, simulated=True, average_for_inference=False, compute_cutoffs=True, win_step_50k = False)
 
 
@@ -73,5 +73,4 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut
         seed = int(args.seed)
     model_name=args.model_name
 
-
     main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed,model_name)
diff --git a/sstar/archie_infer_haplotypes.py b/sstar/archie_infer_haplotypes.py
diff --git a/sstar/archie_only_train.py b/sstar/archie_only_train.py
diff --git a/sstar/archie_train.py b/sstar/archie_train.py
@@ -17,94 +17,73 @@
 
 def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions, create_testdirs = False):
 
-
+    #this variable determines how many prediction directories are created within one folder
     nrep_per_folder = int(nrep / folder_partitions)
 
-
     train_df_list = []
     for i in range(folder_partitions):
         curr_output_dir = output_dir + str(i)
 
         preprocess.store_global_parameters(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir)
+        #create a training folder for the training set
         if not os.path.exists(curr_output_dir):
             os.makedirs(curr_output_dir)
 
-        #and for the test set
-        if create_testdirs == True:
-            if not os.path.exists(curr_output_dir + "test"):
-                os.makedirs(curr_output_dir + "test")    
-
         train._simulation_manager(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, seed)
 
-
-        new_train_df = train._train_archie_return_df(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, drop_dynamic_cols=False)
+        new_train_df = train._train_archie(demo_model_file, nrep_per_folder, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, curr_output_dir, drop_dynamic_cols=False)
 
         train_df_list.append(new_train_df)
 
-
+        #after appending the features to the dataframe, the folder containing the training examples is deleted
         shutil.rmtree(curr_output_dir)
 
+    #the full train dataframe
     train_df = pd.concat(train_df_list)
 
-    #create also reduced dfs
-
-    #drop_dynamic_cols indicate whether non-fixed size features should be dropped
+    #create reduced dfs for training on reduced data sets
     train_df_reduced = train_df.copy()
-
     train_df_no_kurtosis = train_df.copy()
     train_df_no_paired_dist = train_df.copy()
-
     train_df_target_full_reduced = train_df.copy()
 
-
-
+    #drop_dynamic_cols indicate whether non-fixed size features should be dropped etc
     dynamic_cols = [col for col in train_df.columns if ('-ton' in col or col.startswith("pairwised_dist"))]
-
     no_kurtosis_cols = [col for col in train_df.columns if ('kurtosis_pairwised_dist' in col or col.startswith("pairwised_dist")) ]
-
     no_paired_cols = [col for col in train_df.columns if (col.startswith("pairwised_dist"))]
-
     full_reduced_cols = [col for col in train_df.columns if ('-ton' in col or 'pairwised_dist' in col )]
 
     train_df_reduced.drop(dynamic_cols, axis=1, inplace = True, errors='ignore')
-
     train_df_no_kurtosis.drop(no_kurtosis_cols, axis=1, inplace = True, errors='ignore')
     train_df_no_paired_dist.drop(no_paired_cols, axis=1, inplace = True, errors='ignore')
     train_df_target_full_reduced.drop(full_reduced_cols, axis=1, inplace = True, errors='ignore')
 
-
-
+    #write the full and the reduced dataframes to csv files
     train_df.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile.csv")
-
     train_df_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_fixed.csv")
-
     train_df_no_kurtosis.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nokurtosis.csv")
     train_df_no_paired_dist.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_nopaired.csv")
     train_df_target_full_reduced.to_csv(str(demo_model_file) + "_nref" + str(nref) + "_ntgt" + str(ntgt) + "_finalfeaturefile_tgtfullreduced.csv")
 
-
+    #names for models
     scikit_file = output_prefix + ".scikit.pickle"
     statsmodels_file = output_prefix + ".statsmodels.pickle"
     scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
     statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"
-
     scikit_file_no_kurtosis = "nokurt_" + output_prefix + ".scikit.pickle"
     statsmodels_file_no_kurtosis = "nokurt_" + output_prefix + ".statsmodels.pickle"
-
     scikit_file_no_paired_dist = "nopaired_" + output_prefix + ".scikit.pickle"
     statsmodels_file_no_paired_dist = "nopaired_" + output_prefix + ".statsmodels.pickle"
-
     scikit_file_full_reduced = "fullreduced_" + output_prefix + ".scikit.pickle"
     statsmodels_file_full_reduced = "fullreduced_" + output_prefix + ".statsmodels.pickle"
-
     scikit_file_reduced = "fixed_" + output_prefix + ".scikit.pickle"
     statsmodels_file_reduced = "fixed_" + output_prefix + ".statsmodels.pickle"
 
     #call training functions
 
     train.train_statsmodels(train_df, statsmodels_file)
-
     train.train_scikit(train_df, scikit_file)
+
     train.train_statsmodels(train_df_reduced, statsmodels_file_reduced)
     train.train_scikit(train_df_reduced, scikit_file_reduced)
 
@@ -160,5 +139,4 @@ def main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut
 
     folder_partitions = args.folder_partitions
 
-
     main(demo_model_file, nrep, nref, ntgt, ref_id, tgt_id, src_id, seq_len, mut_rate, rec_rate, thread, output_prefix, output_dir, seed, folder_partitions)