Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revisions of VILOCA manuscript #37

Merged
merged 19 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ data. It is designed to analyse genetically heterogeneous samples. Its tools
are written in different programming languages and provide error correction,
haplotype reconstruction and estimation of the frequency of the different
genetic variants present in a mixed sample.
VILOCA takes an alignment file as input, and subsequently generates mutation calls and local haplotypes.


---

Expand Down Expand Up @@ -53,6 +55,13 @@ There are several parameters available:

`--windowsize`: In case no insert file is provided, the genome is tiled into uniform local regions. `windowsize` determines the length of those local regions. It should be of roughly the length of the reads. This is also the length of the haplotypes that are produced.

### Output
`haplotypes` This directory contains the reconstructed local haplotypes as separate fasta files per local region.

`coverage.txt` List of each local region with start and end positions, and number of reads considered in the region.

`cooccurring_mutations.csv` All mutation calls including the information on which haplotype it occurred on.

## Development/CI with Docker
The following command in the root directory will let you interact with the project locally through Docker.
```bash
Expand Down
4 changes: 2 additions & 2 deletions tests/test_long_deletions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def test_long_deletions():

# Input data
bamfile = "test_aln.cram"
snvsfile = "snv/SNV.tsv"
outfile = "snv/SNVs_0.010000.tsv"
snvsfile = "work/snv/SNV.tsv"
outfile = "work/snv/SNVs_0.010000.tsv"

helper_long_deletions.main(bamfile=bamfile, snvsfile=snvsfile, outfile=outfile)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_pooled_post.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def open_side_effect(name):

def test__ingest_sampler_results_gamma_theta():
with patch("builtins.open", mock_open(read_data="bla\n#gamma = 0.967662\n\nlolz\n#theta = 0.88\neof")):
out = pooled_post._ingest_sampler_results_gamma_theta(open("debug/dbg.dbg"), "shorah")
out = pooled_post._ingest_sampler_results_gamma_theta(open("work/debug/dbg.dbg"), "shorah")

np.testing.assert_almost_equal(out[0][0], -0.032872426234558716)
np.testing.assert_almost_equal(out[0][1], -3.431512269664515)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_shotgun_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import shutil

cwd = "./data_1"
f_path = "raw_reads/w-HXB2-2804-3004.extended-ref.fas"
f_path = "work/raw_reads/w-HXB2-2804-3004.extended-ref.fas"
base_files = []

@pytest.fixture(scope="session", autouse=True)
Expand Down Expand Up @@ -35,7 +35,7 @@ def test_e2e_shorah():

assert filecmp.cmp(
"./data_1/test.csv",
"./data_1/snv/SNVs_0.010000_final.csv",
"./data_1/work/snv/SNVs_0.010000_final.csv",
shallow=False
)

Expand Down
2 changes: 2 additions & 0 deletions viloca/b2w.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,8 @@ def build_windows(alignment_file: str, tiling_strategy: TilingStrategy,
reference_filename=reference_filename,
threads=max_proc #1
)
# print this message since pysam print confusing error message with the functions above.
print("Index file was created successfully.")
#reffile = pysam.FastaFile(reference_filename) --> we need to read it in each child processo
#counter = 0 #--> counter is now coputed initially for all windows
reference_name = tiling_strategy.get_reference_name()
Expand Down
2 changes: 1 addition & 1 deletion viloca/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def main():

parser_shotgun.add_argument("--n_max_haplotypes", metavar='INT', type=int,
required=False, default=100, dest="n_max_haplotypes",
help="Guess of maximal guess of haplotypes.")
help="Guess of maximal guess of haplotypes. If VILOCA returns the maximal number of haplotypes then this number was choosen to little and needs to be increased.")

parser_shotgun.add_argument("--conv_thres", metavar='FLOAT', type=float,
required=False, default=1e-03, dest="conv_thres",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import cavi

logging.basicConfig(
filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
filename="viloca.log", encoding="utf-8", level=logging.INFO
)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from . import cavi

logging.basicConfig(
filename="viloca_inference.log", encoding="utf-8", level=logging.INFO
filename="viloca.log", encoding="utf-8", level=logging.INFO
)


Expand Down
2 changes: 1 addition & 1 deletion viloca/shorah_snv.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def parseWindow(line, extended_window_mode, exclude_non_var_pos_threshold,
_, chrom, beg, end, _ = line.rstrip().split("\t")

file_stem = "w-%s-%s-%s" % (chrom, beg, end)
haplo_filename = os.path.join(working_dir, "support", file_stem + ".reads-support.fas")
haplo_filename = os.path.join(working_dir, "haplotypes", file_stem + ".reads-support.fas")

if extended_window_mode:
ref_name = "extended-ref"
Expand Down
34 changes: 25 additions & 9 deletions viloca/shotgun.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ def main(args):
logging.debug("[shotgun] All processes completed successfully.")

# prepare directories
for sd_name in ['debug', 'sampling', 'freq', 'support',
for sd_name in ['debug', 'haplotypes', 'freq', 'sampling', 'work',
'corrected', 'raw_reads', 'inference']:
try:
os.mkdir(sd_name)
Expand Down Expand Up @@ -555,7 +555,7 @@ def main(args):
move_files_into_dir("debug", glob.glob("./w*dbg"))
move_files_into_dir("sampling", glob.glob("./w*smp"))
move_files_into_dir("corrected", glob.glob("./w*reads-cor.fas"))
move_files_into_dir("support", glob.glob("./w*reads-support.fas"))
move_files_into_dir("haplotypes", glob.glob("./w*reads-support.fas"))
move_files_into_dir("freq", glob.glob("./w*reads-freq.csv"))
move_files_into_dir("sampling", glob.glob("./w*smp"))
raw_reads_files = glob.glob('./w*reads.fas') + glob.glob('./w*ref.fas') + glob.glob('./w*qualities.npy')
Expand Down Expand Up @@ -626,15 +626,15 @@ def main(args):
envp_post.post_process_for_envp(
open(f"raw_reads/{stem}.envp-full-ref.fas"),
open(f"raw_reads/{stem}.envp-ref.fas"),
f"support/{stem}.reads-support.fas",
f"support/{stem}.reads-support.fas" # overwrite
f"haplotypes/{stem}.reads-support.fas",
f"haplotypes/{stem}.reads-support.fas" # overwrite
)

# Pooled
b_list = args.b.copy()
if len(b_list) > 1:
for idx, i in enumerate(b_list):
Path(f"sample{idx}/support").mkdir(parents=True, exist_ok=True)
Path(f"sample{idx}/haplotypes").mkdir(parents=True, exist_ok=True)
Path(f"sample{idx}/corrected").mkdir(parents=True, exist_ok=True)
with open("coverage.txt") as cov:
for line in cov:
Expand All @@ -652,16 +652,16 @@ def main(args):
f"raw_reads/{stem}.ref.fas",
filtered_reads.name,
open(f"debug/{stem}.dbg") if inference_type == "shorah" else open(f"inference/{stem}.reads-all_results.pkl", "rb"),
open(f"support/{stem}.reads-support.fas"),
open(f"haplotypes/{stem}.reads-support.fas"),
filtered_cor_reads_path,
inference_type,
None if inference_type != "use_quality_scores" else f"raw_reads/{stem}.qualities.npy" # TODO untested
)
filtered_reads.close()

pooled_post.write_support_file_per_sample(
open(f"support/{stem}.reads-support.fas"),
open(f"sample{idx}/support/{stem}.reads-support.fas", "w+"), # TODO
open(f"haplotypes/{stem}.reads-support.fas"),
open(f"sample{idx}/haplotypes/{stem}.reads-support.fas", "w+"), # TODO
*posterior_and_avg
)

Expand All @@ -680,7 +680,23 @@ def main(args):
os.rename('snv', 'snv_before_%d' % int(time.time()))
os.mkdir('snv')

for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*')+ glob.glob('./cooccurring_mutations.csv'):
for snv_file in glob.glob('./SNV*_final*')+ glob.glob('./cooccurring_mutations.csv'):
shutil.copy(snv_file, 'snv/')
for snv_file in glob.glob('./raw_snv*') + glob.glob('./SNV*.tsv'):
shutil.move(snv_file, 'snv/')

# now move all files that are not directly results into the debug directory
shutil.move("inference", "work")
shutil.move("raw_reads", "work")
shutil.move("sampling", "work")
shutil.move("debug", "work")
shutil.move("freq", "work")
shutil.move("corrected", "work")
shutil.move("reads.fas", "work")
shutil.move("proposed.dat", "work")
shutil.move("snv", "work")
shutil.move(glob.glob('*.cor.fas')[0], "work")

logging.info('shotgun run ends')
logging.info('VILOCA terminated')
print("VILOCA terminated successfully.")
Loading