Skip to content

Commit

Permalink
Iterate over polyploid calls (#582) (#584)
Browse files Browse the repository at this point in the history
* initial tests

* first step: variant stats to iterate over gts

* second step: variant stats logic generalized

* change reader logic to remove diploid assumptions

* remove old diploid check

* initial tests for polyploid reading and writing

---------

Co-authored-by: Adam Wenocur <[email protected]>
Co-authored-by: Jeremy Leipzig <[email protected]>
  • Loading branch information
3 people authored Sep 28, 2023
1 parent f6ad8fb commit 1fae8df
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 69 deletions.
48 changes: 47 additions & 1 deletion apis/python/tests/test_tiledbvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,7 +1148,7 @@ def test_ingest_with_stats(tmp_path):
if "outputs" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
if "stats_test" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
shutil.rmtree(os.path.join(tmp_path, "stats_test"))
tiledbvcf.config_logging("trace")
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "stats_test"), mode="w")
ds.create_dataset(enable_variant_stats=True)
Expand Down Expand Up @@ -1181,6 +1181,52 @@ def test_ingest_with_stats(tmp_path):
)


# Ok to skip is missing bcftools in Windows CI job
@pytest.mark.skipif(
os.environ.get("CI") == "true"
and platform.system() == "Windows"
and shutil.which("bcftools") is None,
reason="no bcftools",
)
def test_ingest_polyploid(tmp_path):
tmp_path_contents = os.listdir(tmp_path)
if "polyploid" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "polyploid"))
shutil.copytree(
os.path.join(TESTS_INPUT_DIR, "polyploid"), os.path.join(tmp_path, "polyploid")
)
raw_inputs = glob.glob(os.path.join(tmp_path, "polyploid", "*.vcf"))
print(f"raw inputs: {raw_inputs}")
for vcf_file in raw_inputs:
subprocess.run(
"bcftools view --no-version -Oz -o " + vcf_file + ".gz " + vcf_file,
shell=True,
check=True,
)
bgzipped_inputs = glob.glob(os.path.join(tmp_path, "polyploid", "*.gz"))
print(f"bgzipped inputs: {bgzipped_inputs}")
for vcf_file in bgzipped_inputs:
assert subprocess.run("bcftools index " + vcf_file, shell=True).returncode == 0
if "polyploid" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "polyploid"))
if "outputs" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "outputs"))
if "polyploid_test" in tmp_path_contents:
shutil.rmtree(os.path.join(tmp_path, "polyploid_test"))
tiledbvcf.config_logging("trace")
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "polyploid_test"), mode="w")
ds.create_dataset(enable_variant_stats=True)
ds.ingest_samples(bgzipped_inputs)
ds = tiledbvcf.Dataset(uri=os.path.join(tmp_path, "polyploid_test"), mode="r")
sample_names = [os.path.basename(file).split(".")[0] for file in bgzipped_inputs]
data_frame = ds.read(
samples=sample_names,
attrs=["contig", "pos_start", "id", "qual", "info_TILEDB_IAF", "sample_name"],
set_af_filter="<0.8",
)
print(data_frame)


def test_ingest_mode_separate(tmp_path):
# tiledbvcf.config_logging("debug")
# Create the dataset
Expand Down
18 changes: 11 additions & 7 deletions libtiledbvcf/src/read/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1298,14 +1298,18 @@ bool Reader::process_query_results_v4() {
// If the allele is in GT, consider it in the pass computation
// TODO: when supporting greater than diploid organisms, expand the
// following boolean statement into a loop
if (!is_ref && ((gt.size() > 0 && allele_index == gt[0]) ||
(gt.size() > 1 && allele_index == gt[1]))) {
pass = pass || allele_passes;
} else {
LOG_TRACE(" ignore allele {} not in GT", allele_index);
{
bool matches_any_allele = false;
for (unsigned int i = 0; i < gt.size(); i++) {
matches_any_allele = matches_any_allele || allele_index == gt[i];
}
if (!is_ref && matches_any_allele) {
pass = pass || allele_passes;
} else {
LOG_TRACE(" ignore allele {} not in GT", allele_index);
}
allele_index++;
}
allele_index++;

// build vector of IAF values for annotation
// add annotation to read_state_.query_results
// - build vector of AFs matching the order of the VCF record
Expand Down
10 changes: 0 additions & 10 deletions libtiledbvcf/src/stats/allele_count.cc
Original file line number Diff line number Diff line change
Expand Up @@ -426,16 +426,6 @@ void AlleleCount::process(
return;
}

// Only haploid and diploid are supported
if (ngt > 2) {
LOG_FATAL(
"Ploidy > 2 not supported: sample={} locus={}:{} ploidy={}",
sample_name,
contig,
pos,
ngt);
}

// Skip if homozygous ref or alleles are missing
if (ngt == 1) {
if (gt0 == 0 || gt0_missing) {
Expand Down
112 changes: 61 additions & 51 deletions libtiledbvcf/src/stats/variant_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -396,43 +396,41 @@ void VariantStats::process(
return;
}

// Only haploid and diploid are supported
if (ngt > 2) {
LOG_FATAL(
"Ploidy > 2 not supported: sample={} locus={}:{} ploidy={}",
sample_name,
contig,
pos,
ngt);
std::vector<int> gt(ngt);
std::vector<int> gt_missing(ngt);
for (int i = 0; i < ngt; i++) {
gt[i] = bcf_gt_allele(dst_[i]);
gt_missing[i] = bcf_gt_is_missing(dst_[i]);
}

int gt0 = bcf_gt_allele(dst_[0]);
int gt1 = ngt == 2 ? bcf_gt_allele(dst_[1]) : 0;
int gt0_missing = bcf_gt_is_missing(dst_[0]);
int gt1_missing = ngt == 2 ? bcf_gt_is_missing(dst_[1]) : 1;
int n_allele = rec->n_allele;

// Skip if GT value is not a valid allele
if (gt0 >= n_allele || gt1 >= n_allele) {
LOG_WARN(
"VariantStats: skipping invalid GT value: sample={} locus={}:{} "
"gt={}/{} n_allele={}",
sample_name,
contig,
pos + 1,
gt0,
gt1,
n_allele);
return;
{
bool any_exceeds_nallele = false;
for (int i = 0; i < ngt; i++) {
any_exceeds_nallele = any_exceeds_nallele || gt[i] >= n_allele;
}
if (any_exceeds_nallele) {
LOG_WARN(
"VariantStats: skipping invalid GT value: sample={} locus={}:{} "
"gt={}/{} n_allele={}",
sample_name,
contig,
pos + 1,
gt[0],
gt[1],
n_allele);
return;
}
}

// Skip if alleles are missing
if (ngt == 1) {
if (gt0_missing) {
return;
if (ngt >= 0) {
bool all_gt_missing = true;
for (int i = 0; i < ngt; i++) {
all_gt_missing = all_gt_missing && gt_missing[i];
}
} else if (ngt == 2) {
if (gt0_missing && gt1_missing) {
if (all_gt_missing) {
return;
}
} else {
Expand All @@ -445,33 +443,45 @@ void VariantStats::process(
// Update called for the REF allele
auto ref = rec->d.allele[0];

// If not missing, update allele count for GT[0]
if (!gt0_missing) {
auto alt = alt_string(ref, rec->d.allele[gt0]);

if (gt0 == 0) {
values_["ref"][AC] += count_delta_;
} else {
values_[alt][AC] += count_delta_;
// Determine homozygosity, generalized over n genotypes:
bool homozygous = true;
{
int first_gt = 0;
bool first_gt_found = false;
for (int i = 0; i < ngt; i++) {
if (!gt_missing[i]) {
if (first_gt_found) {
homozygous = homozygous && gt[i] == first_gt;
} else {
first_gt_found = true;
first_gt = gt[i];
}
} else {
homozygous = false;
}
}
}

// If not missing, update allele count for GT[1]
if (!gt1_missing) {
auto alt = alt_string(ref, rec->d.allele[gt1]);
bool already_added_homozygous = false;
for (int i = 0; i < ngt; i++) {
// If not missing, update allele count for GT[i]
if (!gt_missing[i]) {
auto alt = alt_string(ref, rec->d.allele[gt[i]]);

if (gt1 == 0) {
values_["ref"][AC] += count_delta_;
} else {
values_[alt][AC] += count_delta_;
}

// Update homozygote count, only diploid genotype calls are counted
if (gt0 == gt1) {
if (gt0 == 0) {
values_["ref"][N_HOM] += count_delta_;
if (gt[i] == 0) {
values_["ref"][AC] += count_delta_;
} else {
values_[alt][N_HOM] += count_delta_;
values_[alt][AC] += count_delta_;
}

// Update homozygote count
if (homozygous && !already_added_homozygous) {
if (gt[i] == 0) {
values_["ref"][N_HOM] += count_delta_;
} else {
values_[alt][N_HOM] += count_delta_;
}
already_added_homozygous = true;
}
}
}
Expand Down
13 changes: 13 additions & 0 deletions libtiledbvcf/test/inputs/polyploid/first.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##FORMAT=<ID=AB,Number=1,Type=Float,Description="Allele balance for each het genotype">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT first
chr1 1 . T C,<NON_REF> 829.77 . ExcessHet=3.0103 GT 0/1
chr1 2 . G GTTTA,T,<NON_REF> 1042.73 . ExcessHet=4.8532 GT 0/1
chr1 3 . C A,G 10.032 . ExcessHet=2.0134 GT 2/2
chr1 4 . G GTTTA,<NON_REF> 1042.73 . ExcessHet=4.8532 GT 0/0
13 changes: 13 additions & 0 deletions libtiledbvcf/test/inputs/polyploid/second.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##FORMAT=<ID=AB,Number=1,Type=Float,Description="Allele balance for each het genotype">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT second
chr1 1 . T C,<NON_REF> 829.77 . ExcessHet=3.0103 GT 0/0/1/1
chr1 2 . G GTTTA,T,<NON_REF> 343.73 . ExcessHet=2.8532 GT 0/0/1/1
chr1 3 . C A,G 10.032 . ExcessHet=8.34 GT 2/2/2/2
chr1 4 . G GTTTA,<NON_REF> 343.73 . ExcessHet=2.8532 GT 0/0/1/1
13 changes: 13 additions & 0 deletions libtiledbvcf/test/inputs/polyploid/third.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##FORMAT=<ID=AB,Number=1,Type=Float,Description="Allele balance for each het genotype">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=chr1>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT third
chr1 1 . T C,<NON_REF> 829.77 . ExcessHet=3.0103 GT 0/0/0/0/0/0/1/1
chr1 2 . G GTTTA,T,<NON_REF> 343.73 . ExcessHet=2.8532 GT 0/0/0/0/0/1/1/1
chr1 3 . C A,T 10.032 . ExcessHet=8.34 GT 0/0/0/0/1/1/1/1
chr1 4 . G GTTTA,<NON_REF> 343.73 . ExcessHet=2.8532 GT 0/0/0/0/0/0/0/0
13 changes: 13 additions & 0 deletions libtiledbvcf/test/run-cli-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,19 @@ test ! -e ${upload_dir}/outputs/eighth.vcf || exit 1

[ $(bcftools view -H ${upload_dir}/outputs/second.vcf | wc -l) == "1" ] || exit 1

rm -rf ${upload_dir}/outputs

# test ingesting polyploid calls

mkdir -p ${upload_dir}/outputs

cp -R ${input_dir}/polyploid ${upload_dir}
for file in ${upload_dir}/polyploid/*.vcf;do bcftools view --no-version -Oz -o "${file}".gz "${file}"; done
for file in ${upload_dir}/polyploid/*.gz;do bcftools index "${file}"; done
$tilevcf create -u ${upload_dir}/polyploid_test --enable-variant-stats --enable-allele-count --log-level trace
$tilevcf store -u ${upload_dir}/polyploid_test --log-level trace ${upload_dir}/polyploid/*.vcf.gz
$tilevcf export -u ${upload_dir}/polyploid_test -d ${upload_dir}/outputs -Ov --af-filter '< 0.8' --log-level trace

# test consolidate and vacuum
# -------------------------------------------------------------------
uri=${upload_dir}/pre_test
Expand Down

0 comments on commit 1fae8df

Please sign in to comment.