From 78ec86a92dd7bdd8fffc1a88e416681af1b54234 Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Sat, 27 Jan 2024 14:07:36 -0800 Subject: [PATCH] fix: deprecation of binom_test in scipy 1.12.0 and cryptic error occurring with incorrect VCFs in cyvcf2 0.30.26 (#208) * handle deprecation of binom_test in scipy 1.12.0 https://docs.scipy.org/doc/scipy/release/1.12.0-notes.html#expired-deprecations * reraise cyvcf2 exception with more detailed message * catch ValueErrors in dumpSTR and qcSTR arising from exceptions in cyvcf2 --- trtools/dumpSTR/dumpSTR.py | 7 +++++++ trtools/qcSTR/qcSTR.py | 19 ++++++++++++++++++- trtools/utils/tr_harmonizer.py | 15 ++++++++++++++- trtools/utils/utils.py | 6 +++++- 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/trtools/dumpSTR/dumpSTR.py b/trtools/dumpSTR/dumpSTR.py index c572e6d2..17b9ac21 100644 --- a/trtools/dumpSTR/dumpSTR.py +++ b/trtools/dumpSTR/dumpSTR.py @@ -1201,6 +1201,13 @@ def main(args): return 1 else: raise te + except ValueError as ve: + message = ve.args[0] + if 'properly formatted' in message: + common.WARNING("Could not parse VCF.\n" + message) + return 1 + else: + raise ve if args.verbose: common.MSG("Processing %s:%s"%(record.chrom, record.pos)) record_counter += 1 diff --git a/trtools/qcSTR/qcSTR.py b/trtools/qcSTR/qcSTR.py index f34bacaa..891f1e83 100644 --- a/trtools/qcSTR/qcSTR.py +++ b/trtools/qcSTR/qcSTR.py @@ -501,7 +501,24 @@ def main(args): # read the vcf numrecords = 0 - for trrecord in harmonizer: + while True: + try: + trrecord = next(harmonizer) + except StopIteration: break + except TypeError as te: + message = te.args[0] + if 'missing' in message and 'mandatory' in message: + common.WARNING("Could not parse VCF.\n" + message) + return 1 + else: + raise te + except ValueError as ve: + message = ve.args[0] + if 'properly formatted' in message: + common.WARNING("Could not parse VCF.\n" + message) + return 1 + else: + raise ve if args.numrecords is not None and numrecords >= args.numrecords: break if args.period is not None and len(trrecord.motif) != args.period: continue diff --git a/trtools/utils/tr_harmonizer.py b/trtools/utils/tr_harmonizer.py index 0b0c68a3..03e214b9 100644 --- a/trtools/utils/tr_harmonizer.py +++ b/trtools/utils/tr_harmonizer.py @@ -1544,6 +1544,7 @@ class TRRecordHarmonizer: def __init__(self, vcffile: cyvcf2.VCF, vcftype: Union[str, VcfTypes] = "auto"): self.vcffile = vcffile self.vcftype = InferVCFType(vcffile, vcftype) + self._record_idx = None def MayHaveImpureRepeats(self) -> bool: """ @@ -1619,6 +1620,18 @@ def __iter__(self) -> Iterator[TRRecord]: def __next__(self) -> TRRecord: """Iterate over TRRecord produced from the underlying vcf.""" - return HarmonizeRecord(self.vcftype, next(self.vcffile)) + if self._record_idx is None: + self._record_idx = 1 + self._record_idx += 1 + try: + record = next(self.vcffile) + except StopIteration: + raise + except Exception: + raise ValueError( + "Unable to parse the "+str(self._record_idx)+"th tandem " + "repeat in the provided VCF. Check that it is properly formatted." + ) + return HarmonizeRecord(self.vcftype, record) # TODO check all users of this class for new options diff --git a/trtools/utils/utils.py b/trtools/utils/utils.py index 6cddb784..c1d55955 100644 --- a/trtools/utils/utils.py +++ b/trtools/utils/utils.py @@ -318,7 +318,11 @@ def GetHardyWeinbergBinomialTest(allele_freqs, genotype_counts): if gt[1] not in allele_freqs.keys(): return np.nan if gt[0] == gt[1]: num_hom += genotype_counts[gt] - return scipy.stats.binom_test(num_hom, n=total_samples, p=exp_hom_frac) + try: + return scipy.stats.binom_test(num_hom, n=total_samples, p=exp_hom_frac) + except AttributeError: + # binom_test was deprecated in favor of binomtest in scipy 1.12.0 + return scipy.stats.binomtest(num_hom, n=total_samples, p=exp_hom_frac).pvalue def GetHomopolymerRun(seq): r"""Compute the maximum homopolymer run length in a sequence