cbg-ethz · LaraFuhrmann · Sep 6, 2024 · Jul 22, 2024 · Jul 22, 2024 · Jul 22, 2024
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ data. It is designed to analyse genetically heterogeneous samples. Its tools
 are written in different programming languages and provide error correction,
 haplotype reconstruction and estimation of the frequency of the different
 genetic variants present in a mixed sample.
+VILOCA takes an alignment file as input, and subsequently generates mutation calls and local haplotypes.
+
 
 ---
 
@@ -53,6 +55,13 @@ There are several parameters available:
 
 `--windowsize`: In case no insert file is provided, the genome is tiled into uniform local regions. `windowsize` determines the length of those local regions. It should be of roughly the length of the reads. This is also the length of the haplotypes that are produced.
 
+### Output
+`haplotypes` This directory contains the reconstructed local haplotypes as separate fasta files per local region.
+
+`coverage.txt` List of each local region with start and end positions, and number of reads considered in the region.
+
+`cooccurring_mutations.csv` All mutation calls including the information on which haplotype it occurred on.
+
 ## Development/CI with Docker
 The following command in the root directory will let you interact with the project locally through Docker.
 ```bash

diff --git a/tests/test_long_deletions.py b/tests/test_long_deletions.py
@@ -16,8 +16,8 @@ def test_long_deletions():
 
     # Input data
     bamfile = "test_aln.cram"
-    snvsfile = "snv/SNV.tsv"
-    outfile = "snv/SNVs_0.010000.tsv"
+    snvsfile = "work/snv/SNV.tsv"
+    outfile = "work/snv/SNVs_0.010000.tsv"
 
     helper_long_deletions.main(bamfile=bamfile, snvsfile=snvsfile, outfile=outfile)
 

diff --git a/tests/test_pooled_post.py b/tests/test_pooled_post.py
@@ -24,7 +24,7 @@ def open_side_effect(name):
 
 def test__ingest_sampler_results_gamma_theta():
     with patch("builtins.open", mock_open(read_data="bla\n#gamma = 0.967662\n\nlolz\n#theta = 0.88\neof")):
-        out = pooled_post._ingest_sampler_results_gamma_theta(open("debug/dbg.dbg"), "shorah")
+        out = pooled_post._ingest_sampler_results_gamma_theta(open("work/debug/dbg.dbg"), "shorah")
 
         np.testing.assert_almost_equal(out[0][0], -0.032872426234558716)
         np.testing.assert_almost_equal(out[0][1], -3.431512269664515)

diff --git a/tests/test_shotgun_e2e.py b/tests/test_shotgun_e2e.py
@@ -5,7 +5,7 @@
 import shutil
 
 cwd = "./data_1"
-f_path = "raw_reads/w-HXB2-2804-3004.extended-ref.fas"
+f_path = "work/raw_reads/w-HXB2-2804-3004.extended-ref.fas"
 base_files = []
 
 @pytest.fixture(scope="session", autouse=True)
@@ -35,7 +35,7 @@ def test_e2e_shorah():
 
     assert filecmp.cmp(
         "./data_1/test.csv",
-        "./data_1/snv/SNVs_0.010000_final.csv",
+        "./data_1/work/snv/SNVs_0.010000_final.csv",
         shallow=False
     )
 

diff --git a/viloca/b2w.py b/viloca/b2w.py
@@ -737,6 +737,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
         reference_filename=reference_filename,
         threads=max_proc #1
     )
+    # print this message since pysam print confusing error message with the functions above.
+    print("Index file was created successfully.")
     #reffile = pysam.FastaFile(reference_filename) --> we need to read it in each child processo
     #counter = 0 #--> counter is now coputed initially for all windows
     reference_name = tiling_strategy.get_reference_name()

diff --git a/viloca/cli.py b/viloca/cli.py
@@ -183,7 +183,7 @@ def main():
 
     parser_shotgun.add_argument("--n_max_haplotypes", metavar='INT', type=int,
                                 required=False, default=100, dest="n_max_haplotypes",
-                                help="Guess of maximal guess of haplotypes.")
+                                help="Guess of maximal guess of haplotypes. If VILOCA returns the maximal number of haplotypes then this number was choosen to little and needs to be increased.")
 
     parser_shotgun.add_argument("--conv_thres", metavar='FLOAT', type=float,
                             required=False, default=1e-03, dest="conv_thres",

diff --git a/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py b/viloca/local_haplotype_inference/learn_error_params/run_dpm_mfa.py
@@ -11,7 +11,7 @@
 from . import cavi
 
 logging.basicConfig(
-    filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
+    filename="viloca.log", encoding="utf-8", level=logging.INFO
 )
 
 

diff --git a/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py b/viloca/local_haplotype_inference/use_quality_scores/run_dpm_mfa.py
@@ -13,7 +13,7 @@
 from . import cavi
 
 logging.basicConfig(
-    filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
+    filename="viloca.log", encoding="utf-8", level=logging.INFO
 )
 
 

diff --git a/viloca/shorah_snv.py b/viloca/shorah_snv.py
@@ -203,7 +203,7 @@ def parseWindow(line, extended_window_mode, exclude_non_var_pos_threshold,
     _, chrom, beg, end, _ = line.rstrip().split("\t")
 
     file_stem = "w-%s-%s-%s" % (chrom, beg, end)
-    haplo_filename = os.path.join(working_dir, "support", file_stem + ".reads-support.fas")
+    haplo_filename = os.path.join(working_dir, "haplotypes", file_stem + ".reads-support.fas")
 
     if extended_window_mode:
         ref_name = "extended-ref"

diff --git a/viloca/shotgun.py b/viloca/shotgun.py
@@ -521,7 +521,7 @@ def main(args):
     logging.debug("[shotgun] All processes completed successfully.")
 
     # prepare directories
-    for sd_name in ['debug', 'sampling', 'freq', 'support',
+    for sd_name in ['debug', 'haplotypes', 'freq', 'sampling', 'work',
                     'corrected', 'raw_reads', 'inference']:
         try:
             os.mkdir(sd_name)
@@ -555,7 +555,7 @@ def main(args):
     move_files_into_dir("debug", glob.glob("./w*dbg"))
     move_files_into_dir("sampling", glob.glob("./w*smp"))
     move_files_into_dir("corrected", glob.glob("./w*reads-cor.fas"))
-    move_files_into_dir("support", glob.glob("./w*reads-support.fas"))
+    move_files_into_dir("haplotypes", glob.glob("./w*reads-support.fas"))
     move_files_into_dir("freq", glob.glob("./w*reads-freq.csv"))
     move_files_into_dir("sampling", glob.glob("./w*smp"))
     raw_reads_files = glob.glob('./w*reads.fas') + glob.glob('./w*ref.fas') + glob.glob('./w*qualities.npy')
@@ -626,15 +626,15 @@ def main(args):
                 envp_post.post_process_for_envp(
                     open(f"raw_reads/{stem}.envp-full-ref.fas"),
                     open(f"raw_reads/{stem}.envp-ref.fas"),
-                    f"support/{stem}.reads-support.fas",
-                    f"support/{stem}.reads-support.fas" # overwrite
+                    f"haplotypes/{stem}.reads-support.fas",
+                    f"haplotypes/{stem}.reads-support.fas" # overwrite
                 )
 
     # Pooled
     b_list = args.b.copy()
     if len(b_list) > 1:
         for idx, i in enumerate(b_list):
-            Path(f"sample{idx}/support").mkdir(parents=True, exist_ok=True)
+            Path(f"sample{idx}/haplotypes").mkdir(parents=True, exist_ok=True)
             Path(f"sample{idx}/corrected").mkdir(parents=True, exist_ok=True)
             with open("coverage.txt") as cov:
                 for line in cov:
@@ -652,16 +652,16 @@ def main(args):
                         f"raw_reads/{stem}.ref.fas",
                         filtered_reads.name,
                         open(f"debug/{stem}.dbg") if inference_type == "shorah" else open(f"inference/{stem}.reads-all_results.pkl", "rb"),
-                        open(f"support/{stem}.reads-support.fas"),
+                        open(f"haplotypes/{stem}.reads-support.fas"),
                         filtered_cor_reads_path,
                         inference_type,
                         None if inference_type != "use_quality_scores" else f"raw_reads/{stem}.qualities.npy" # TODO untested
                     )
                     filtered_reads.close()
 
                     pooled_post.write_support_file_per_sample(
-                        open(f"support/{stem}.reads-support.fas"),
-                        open(f"sample{idx}/support/{stem}.reads-support.fas", "w+"), # TODO
+                        open(f"haplotypes/{stem}.reads-support.fas"),
+                        open(f"sample{idx}/haplotypes/{stem}.reads-support.fas", "w+"), # TODO
                         *posterior_and_avg
                     )
 
@@ -680,7 +680,23 @@ def main(args):
             os.rename('snv', 'snv_before_%d' % int(time.time()))
             os.mkdir('snv')
 
-        for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*')+ glob.glob('./cooccurring_mutations.csv'):
+        for snv_file in glob.glob('./SNV*_final*')+ glob.glob('./cooccurring_mutations.csv'):
+            shutil.copy(snv_file, 'snv/')
+        for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*.tsv'):
             shutil.move(snv_file, 'snv/')
 
+    # now move all files that are not directly results into the debug directory
+    shutil.move("inference", "work")
+    shutil.move("raw_reads", "work")
+    shutil.move("sampling", "work")
+    shutil.move("debug", "work")
+    shutil.move("freq", "work")
+    shutil.move("corrected", "work")
+    shutil.move("reads.fas", "work")
+    shutil.move("proposed.dat", "work")
+    shutil.move("snv", "work")
+    shutil.move(glob.glob('*.cor.fas')[0], "work")
+
     logging.info('shotgun run ends')
+    logging.info('VILOCA terminated')
+    print("VILOCA terminated successfully.")