From 1fae8dff5d2e9bbb371744756089b34083feaa2b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 17:24:38 -0400 Subject: [PATCH] Iterate over polyploid calls (#582) (#584) * initial tests * first step: variant stats to iterate over gts * second step: variant stats logic generalized * change reader logic to remove diploid assumptions * remove old diploid check * initial tests for polyploid reading and writing --------- Co-authored-by: Adam Wenocur Co-authored-by: Jeremy Leipzig --- apis/python/tests/test_tiledbvcf.py | 48 +++++++- libtiledbvcf/src/read/reader.cc | 18 +-- libtiledbvcf/src/stats/allele_count.cc | 10 -- libtiledbvcf/src/stats/variant_stats.cc | 112 ++++++++++-------- libtiledbvcf/test/inputs/polyploid/first.vcf | 13 ++ libtiledbvcf/test/inputs/polyploid/second.vcf | 13 ++ libtiledbvcf/test/inputs/polyploid/third.vcf | 13 ++ libtiledbvcf/test/run-cli-tests.sh | 13 ++ 8 files changed, 171 insertions(+), 69 deletions(-) create mode 100644 libtiledbvcf/test/inputs/polyploid/first.vcf create mode 100644 libtiledbvcf/test/inputs/polyploid/second.vcf create mode 100644 libtiledbvcf/test/inputs/polyploid/third.vcf diff --git a/apis/python/tests/test_tiledbvcf.py b/apis/python/tests/test_tiledbvcf.py index de42af919..0ed1dc0f0 100755 --- a/apis/python/tests/test_tiledbvcf.py +++ b/apis/python/tests/test_tiledbvcf.py @@ -1148,7 +1148,7 @@ def test_ingest_with_stats(tmp_path): if "outputs" in tmp_path_contents: shutil.rmtree(os.path.join(tmp_path, "outputs")) if "stats_test" in tmp_path_contents: - shutil.rmtree(os.path.join(tmp_path, "outputs")) + shutil.rmtree(os.path.join(tmp_path, "stats_test")) tiledbvcf.config_logging("trace") ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="w") ds.create_dataset(enable_variant_stats=True) @@ -1181,6 +1181,52 @@ def test_ingest_with_stats(tmp_path): ) +# Ok to skip is missing bcftools in Windows CI job +@pytest.mark.skipif( + os.environ.get("CI") == "true" + and platform.system() == "Windows" + and shutil.which("bcftools") is None, + reason="no bcftools", +) +def test_ingest_polyploid(tmp_path): + tmp_path_contents = os.listdir(tmp_path) + if "polyploid" in tmp_path_contents: + shutil.rmtree(os.path.join(tmp_path, "polyploid")) + shutil.copytree( + os.path.join(TESTS_INPUT_DIR, "polyploid"), os.path.join(tmp_path, "polyploid") + ) + raw_inputs = glob.glob(os.path.join(tmp_path, "polyploid", "*.vcf")) + print(f"raw inputs: {raw_inputs}") + for vcf_file in raw_inputs: + subprocess.run( + "bcftools view --no-version -Oz -o " + vcf_file + ".gz " + vcf_file, + shell=True, + check=True, + ) + bgzipped_inputs = glob.glob(os.path.join(tmp_path, "polyploid", "*.gz")) + print(f"bgzipped inputs: {bgzipped_inputs}") + for vcf_file in bgzipped_inputs: + assert subprocess.run("bcftools index " + vcf_file, shell=True).returncode == 0 + if "polyploid" in tmp_path_contents: + shutil.rmtree(os.path.join(tmp_path, "polyploid")) + if "outputs" in tmp_path_contents: + shutil.rmtree(os.path.join(tmp_path, "outputs")) + if "polyploid_test" in tmp_path_contents: + shutil.rmtree(os.path.join(tmp_path, "polyploid_test")) + tiledbvcf.config_logging("trace") + ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "polyploid_test"), mode="w") + ds.create_dataset(enable_variant_stats=True) + ds.ingest_samples(bgzipped_inputs) + ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "polyploid_test"), mode="r") + sample_names = [os.path.basename(file).split(".")[0] for file in bgzipped_inputs] + data_frame = ds.read( + samples=sample_names, + attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"], + set_af_filter="<0.8", + ) + print(data_frame) + + def test_ingest_mode_separate(tmp_path): # tiledbvcf.config_logging("debug") # Create the dataset diff --git a/libtiledbvcf/src/read/reader.cc b/libtiledbvcf/src/read/reader.cc index 27415accf..53e955386 100644 --- a/libtiledbvcf/src/read/reader.cc +++ b/libtiledbvcf/src/read/reader.cc @@ -1298,14 +1298,18 @@ bool Reader::process_query_results_v4() { // If the allele is in GT, consider it in the pass computation // TODO: when supporting greater than diploid organisms, expand the // following boolean statement into a loop - if (!is_ref && ((gt.size() > 0 && allele_index == gt[0]) || - (gt.size() > 1 && allele_index == gt[1]))) { - pass = pass || allele_passes; - } else { - LOG_TRACE(" ignore allele {} not in GT", allele_index); + { + bool matches_any_allele = false; + for (unsigned int i = 0; i < gt.size(); i++) { + matches_any_allele = matches_any_allele || allele_index == gt[i]; + } + if (!is_ref && matches_any_allele) { + pass = pass || allele_passes; + } else { + LOG_TRACE(" ignore allele {} not in GT", allele_index); + } + allele_index++; } - allele_index++; - // build vector of IAF values for annotation // add annotation to read_state_.query_results // - build vector of AFs matching the order of the VCF record diff --git a/libtiledbvcf/src/stats/allele_count.cc b/libtiledbvcf/src/stats/allele_count.cc index 369f1c3f3..de2d94599 100644 --- a/libtiledbvcf/src/stats/allele_count.cc +++ b/libtiledbvcf/src/stats/allele_count.cc @@ -426,16 +426,6 @@ void AlleleCount::process( return; } - // Only haploid and diploid are supported - if (ngt > 2) { - LOG_FATAL( - "Ploidy > 2 not supported: sample={} locus={}:{} ploidy={}", - sample_name, - contig, - pos, - ngt); - } - // Skip if homozygous ref or alleles are missing if (ngt == 1) { if (gt0 == 0 || gt0_missing) { diff --git a/libtiledbvcf/src/stats/variant_stats.cc b/libtiledbvcf/src/stats/variant_stats.cc index 7c4971ad8..e57735413 100644 --- a/libtiledbvcf/src/stats/variant_stats.cc +++ b/libtiledbvcf/src/stats/variant_stats.cc @@ -396,43 +396,41 @@ void VariantStats::process( return; } - // Only haploid and diploid are supported - if (ngt > 2) { - LOG_FATAL( - "Ploidy > 2 not supported: sample={} locus={}:{} ploidy={}", - sample_name, - contig, - pos, - ngt); + std::vector gt(ngt); + std::vector gt_missing(ngt); + for (int i = 0; i < ngt; i++) { + gt[i] = bcf_gt_allele(dst_[i]); + gt_missing[i] = bcf_gt_is_missing(dst_[i]); } - - int gt0 = bcf_gt_allele(dst_[0]); - int gt1 = ngt == 2 ? bcf_gt_allele(dst_[1]) : 0; - int gt0_missing = bcf_gt_is_missing(dst_[0]); - int gt1_missing = ngt == 2 ? bcf_gt_is_missing(dst_[1]) : 1; int n_allele = rec->n_allele; // Skip if GT value is not a valid allele - if (gt0 >= n_allele || gt1 >= n_allele) { - LOG_WARN( - "VariantStats: skipping invalid GT value: sample={} locus={}:{} " - "gt={}/{} n_allele={}", - sample_name, - contig, - pos + 1, - gt0, - gt1, - n_allele); - return; + { + bool any_exceeds_nallele = false; + for (int i = 0; i < ngt; i++) { + any_exceeds_nallele = any_exceeds_nallele || gt[i] >= n_allele; + } + if (any_exceeds_nallele) { + LOG_WARN( + "VariantStats: skipping invalid GT value: sample={} locus={}:{} " + "gt={}/{} n_allele={}", + sample_name, + contig, + pos + 1, + gt[0], + gt[1], + n_allele); + return; + } } // Skip if alleles are missing - if (ngt == 1) { - if (gt0_missing) { - return; + if (ngt >= 0) { + bool all_gt_missing = true; + for (int i = 0; i < ngt; i++) { + all_gt_missing = all_gt_missing && gt_missing[i]; } - } else if (ngt == 2) { - if (gt0_missing && gt1_missing) { + if (all_gt_missing) { return; } } else { @@ -445,33 +443,45 @@ void VariantStats::process( // Update called for the REF allele auto ref = rec->d.allele[0]; - // If not missing, update allele count for GT[0] - if (!gt0_missing) { - auto alt = alt_string(ref, rec->d.allele[gt0]); - - if (gt0 == 0) { - values_["ref"][AC] += count_delta_; - } else { - values_[alt][AC] += count_delta_; + // Determine homozygosity, generalized over n genotypes: + bool homozygous = true; + { + int first_gt = 0; + bool first_gt_found = false; + for (int i = 0; i < ngt; i++) { + if (!gt_missing[i]) { + if (first_gt_found) { + homozygous = homozygous && gt[i] == first_gt; + } else { + first_gt_found = true; + first_gt = gt[i]; + } + } else { + homozygous = false; + } } } - // If not missing, update allele count for GT[1] - if (!gt1_missing) { - auto alt = alt_string(ref, rec->d.allele[gt1]); + bool already_added_homozygous = false; + for (int i = 0; i < ngt; i++) { + // If not missing, update allele count for GT[i] + if (!gt_missing[i]) { + auto alt = alt_string(ref, rec->d.allele[gt[i]]); - if (gt1 == 0) { - values_["ref"][AC] += count_delta_; - } else { - values_[alt][AC] += count_delta_; - } - - // Update homozygote count, only diploid genotype calls are counted - if (gt0 == gt1) { - if (gt0 == 0) { - values_["ref"][N_HOM] += count_delta_; + if (gt[i] == 0) { + values_["ref"][AC] += count_delta_; } else { - values_[alt][N_HOM] += count_delta_; + values_[alt][AC] += count_delta_; + } + + // Update homozygote count + if (homozygous && !already_added_homozygous) { + if (gt[i] == 0) { + values_["ref"][N_HOM] += count_delta_; + } else { + values_[alt][N_HOM] += count_delta_; + } + already_added_homozygous = true; } } } diff --git a/libtiledbvcf/test/inputs/polyploid/first.vcf b/libtiledbvcf/test/inputs/polyploid/first.vcf new file mode 100644 index 000000000..ce2ba2dc9 --- /dev/null +++ b/libtiledbvcf/test/inputs/polyploid/first.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first +chr1 1 . T C, 829.77 . ExcessHet=3.0103 GT 0/1 +chr1 2 . G GTTTA,T, 1042.73 . ExcessHet=4.8532 GT 0/1 +chr1 3 . C A,G 10.032 . ExcessHet=2.0134 GT 2/2 +chr1 4 . G GTTTA, 1042.73 . ExcessHet=4.8532 GT 0/0 diff --git a/libtiledbvcf/test/inputs/polyploid/second.vcf b/libtiledbvcf/test/inputs/polyploid/second.vcf new file mode 100644 index 000000000..3ab71374a --- /dev/null +++ b/libtiledbvcf/test/inputs/polyploid/second.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT second +chr1 1 . T C, 829.77 . ExcessHet=3.0103 GT 0/0/1/1 +chr1 2 . G GTTTA,T, 343.73 . ExcessHet=2.8532 GT 0/0/1/1 +chr1 3 . C A,G 10.032 . ExcessHet=8.34 GT 2/2/2/2 +chr1 4 . G GTTTA, 343.73 . ExcessHet=2.8532 GT 0/0/1/1 \ No newline at end of file diff --git a/libtiledbvcf/test/inputs/polyploid/third.vcf b/libtiledbvcf/test/inputs/polyploid/third.vcf new file mode 100644 index 000000000..5ee9972cb --- /dev/null +++ b/libtiledbvcf/test/inputs/polyploid/third.vcf @@ -0,0 +1,13 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT third +chr1 1 . T C, 829.77 . ExcessHet=3.0103 GT 0/0/0/0/0/0/1/1 +chr1 2 . G GTTTA,T, 343.73 . ExcessHet=2.8532 GT 0/0/0/0/0/1/1/1 +chr1 3 . C A,T 10.032 . ExcessHet=8.34 GT 0/0/0/0/1/1/1/1 +chr1 4 . G GTTTA, 343.73 . ExcessHet=2.8532 GT 0/0/0/0/0/0/0/0 \ No newline at end of file diff --git a/libtiledbvcf/test/run-cli-tests.sh b/libtiledbvcf/test/run-cli-tests.sh index 930caeb6f..1420717a3 100755 --- a/libtiledbvcf/test/run-cli-tests.sh +++ b/libtiledbvcf/test/run-cli-tests.sh @@ -476,6 +476,19 @@ test ! -e ${upload_dir}/outputs/eighth.vcf || exit 1 [ $(bcftools view -H ${upload_dir}/outputs/second.vcf | wc -l) == "1" ] || exit 1 +rm -rf ${upload_dir}/outputs + +# test ingesting polyploid calls + +mkdir -p ${upload_dir}/outputs + +cp -R ${input_dir}/polyploid ${upload_dir} +for file in ${upload_dir}/polyploid/*.vcf;do bcftools view --no-version -Oz -o "${file}".gz "${file}"; done +for file in ${upload_dir}/polyploid/*.gz;do bcftools index "${file}"; done +$tilevcf create -u ${upload_dir}/polyploid_test --enable-variant-stats --enable-allele-count --log-level trace +$tilevcf store -u ${upload_dir}/polyploid_test --log-level trace ${upload_dir}/polyploid/*.vcf.gz +$tilevcf export -u ${upload_dir}/polyploid_test -d ${upload_dir}/outputs -Ov --af-filter '< 0.8' --log-level trace + # test consolidate and vacuum # ------------------------------------------------------------------- uri=${upload_dir}/pre_test