From 89c1dd315a1cb96edb2d3479f7eb36990044a01c Mon Sep 17 00:00:00 2001 From: Arya Massarat <23412689+aryarm@users.noreply.github.com> Date: Wed, 7 Feb 2024 10:10:15 -0800 Subject: [PATCH] refactor!: remove pybedtools as a dependency. This slightly changes the interpretation of BED files. From now on, the end position of each region will not be included. (#184) * refactor dumpSTR filtering code to use pysam instead of pybedtools * remove requirement for pybedtools * docs: warn that end positions of BED regions are exclusive --- .readthedocs_conda_env.yml | 1 - README.rst | 2 -- pyproject.toml | 1 - trtools/dumpSTR/README.rst | 2 +- trtools/dumpSTR/filters.py | 26 ++++++++++++++----- trtools/dumpSTR/tests/test_dumpSTR.py | 2 +- .../dumpSTR_vcfs/locus_filters.loclog.tab | 2 +- .../dumpSTR_vcfs/locus_filters.vcf | 2 +- 8 files changed, 24 insertions(+), 14 deletions(-) diff --git a/.readthedocs_conda_env.yml b/.readthedocs_conda_env.yml index d5dab8cc..44f8cb8d 100644 --- a/.readthedocs_conda_env.yml +++ b/.readthedocs_conda_env.yml @@ -10,7 +10,6 @@ dependencies: - pytest-cov - importlib-metadata - numpy - - pybedtools - matplotlib-base - pysam - scipy diff --git a/README.rst b/README.rst index b0f60a52..02284c2d 100644 --- a/README.rst +++ b/README.rst @@ -72,8 +72,6 @@ checkout the branch you're interested in, and run the following command from the pip install --upgrade pip pip install -e . -Note: required package :code:`pybedtools` requires zlib. If you receive an error about a missing file :code:`zlib.h`, you can install on Ubuntu using :code:`sudo apt-get install zlib1g-dev` or CentOS using :code:`sudo yum install zlib-devel`. - Note: make sure TRTools is not installed in the environment via a different method before installing from source. :code:`which dumpSTR` should return nothing. Note: if you will run or test :code:`simTR`, you will also need to install `ART `_. The simTR tests will only run if the executable :code:`art_illumina` is found on your :code:`PATH`. If it has been installed, :code:`which art_illumina` should return a path. diff --git a/pyproject.toml b/pyproject.toml index db44d1d9..590e4ec1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ dependencies = [ "matplotlib", "numpy", "pandas", - "pybedtools", "pysam", "scikit-learn", "scipy", diff --git a/trtools/dumpSTR/README.rst b/trtools/dumpSTR/README.rst index 18bcee94..f281a132 100644 --- a/trtools/dumpSTR/README.rst +++ b/trtools/dumpSTR/README.rst @@ -58,7 +58,7 @@ where p_i is the frequency of allele i * :code:`--max-locus-het `: Filters loci with high heterozygosity * :code:`--use-length`: Use allele lengths, rather than sequences, to compute heterozygosity and HWE (only relevant for HipSTR, which reports sequence level differences in TR alleles) -* :code:`--filter-regions `: Filter TRs overlapping the specified set of regions. Must be used with :code:`--filter-regions-names`. Can supply a comma-separated list to each to apply multiple region filters. Bed files must be sorted and tabix-indexed. +* :code:`--filter-regions `: Filter TRs overlapping the specified set of regions. Must be used with :code:`--filter-regions-names`. Can supply a comma-separated list to each to apply multiple region filters. Bed files must be sorted and tabix-indexed. Note that regions are 0-based and inclusive of the start position, but exclusive of the end position. * :code:`--filter-regions-names `: Filter names for each BED file specified in :code:`--filter-regions`. * :code:`--filter-hrun`: Filter repeats with long homopolymer runs. Only used for HipSTR VCF files otherwise ignored. Filters pentanucleotides with homopolymer runs of 5bp or longer, or hexanucleotides with homopolymer runs of 6bp or longer. * :code:`--drop-filtered`: Do not output loci that were filtered, or loci with no calls remaining after filtering. diff --git a/trtools/dumpSTR/filters.py b/trtools/dumpSTR/filters.py index 940ec300..390d04e9 100644 --- a/trtools/dumpSTR/filters.py +++ b/trtools/dumpSTR/filters.py @@ -6,7 +6,7 @@ import os import numpy as np -from pybedtools import BedTool +from pysam import TabixFile, asBed import trtools.utils.common as common import trtools.utils.tr_harmonizer as trh @@ -263,7 +263,7 @@ def LoadRegions(self, filename): common.WARNING("Could not find tabix index %s.tbi"%filename) self.pass_checks = False return - self.regions = BedTool(filename) + self.regions = TabixFile(filename, parser=asBed()) def __call__(self, record: trh.TRRecord): interval = "%s:%s-%s"%(record.chrom, record.pos, record.pos + record.ref_allele_length) @@ -272,10 +272,24 @@ def __call__(self, record: trh.TRRecord): interval2 = interval.replace("chr","") else: interval2 = "chr"+interval # Try with and without chr - tb1 = self.regions.tabix_intervals(interval) - if tb1.count() > 0: return self.name - tb2 = self.regions.tabix_intervals(interval2) - if tb2.count() > 0: return self.name + # A ValueError or StopIteration indicates that there weren't any matching + # regions within the given interval + try: + next(self.regions.fetch(region=interval, multiple_iterators=True)) + except ValueError: + pass + except StopIteration: + pass + else: + return self.name + try: + next(self.regions.fetch(region=interval2, multiple_iterators=True)) + except ValueError: + pass + except StopIteration: + pass + else: + return self.name return None def filter_name(self): return self.name diff --git a/trtools/dumpSTR/tests/test_dumpSTR.py b/trtools/dumpSTR/tests/test_dumpSTR.py index 3ae0f364..057a13e0 100644 --- a/trtools/dumpSTR/tests/test_dumpSTR.py +++ b/trtools/dumpSTR/tests/test_dumpSTR.py @@ -653,7 +653,7 @@ def test_beagle_disallowed_call_filters(tmpdir, vcfdir): """ These tests run dumpSTR and compare its output -to output that has been generated by a pervious version of +to output that has been generated by a previous version of dumpSTR and saved in the repo. The results are expected to be identical. diff --git a/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.loclog.tab b/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.loclog.tab index 83dccc08..7fe7f45a 100644 --- a/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.loclog.tab +++ b/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.loclog.tab @@ -5,4 +5,4 @@ FILTER:CALLRATE0.5 473 FILTER:HWE0.5 213 FILTER:HETLOW0.05 8005 FILTER:HETHIGH0.45 584 -FILTER:foo_region 341 +FILTER:foo_region 342 diff --git a/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.vcf b/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.vcf index 49cce81a..fe46eb24 100644 --- a/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.vcf +++ b/trtools/testsupport/sample_vcfs/dumpSTR_vcfs/locus_filters.vcf @@ -439,7 +439,7 @@ chr21 14383027 GANGSTR_437028 TTTTGTTGTTGTTGTTG TGTTGTTGTTGTTGTTG . HETLOW0.05;f chr21 14384512 GANGSTR_437029 CACTCCACTC CACTCCATTC . foo_region INFRAME_PGEOM=0.95;INFRAME_UP=0.01;INFRAME_DOWN=0.01;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;START=14384512;END=14384521;PERIOD=5;NSKIP=0;NFILT=0;BPDIFFS=0;DP=244;DSNP=0;DSTUTTER=0;DFLANKINDEL=5;AN=6;REFAC=4;AC=2;HRUN=2;HET=0.444444;HWEP=0.588477 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|1:0|0:1:0.5:81:0:0:4:53.04|27.96:0|0:69.99:-2.47:74:-0.35:0|57:0|53:PASS 0|0:0|0:1:1:81:0:0:1:40.50|40.50:0|0:22.26:0:.:0:0|62:0|62:PASS 0|1:0|0:1:0.5:82:0:0:0:60.16|21.84:0|0:51.35:-5.14:75:-0.23:0|54:0|54:PASS chr21 14390199 GANGSTR_437030 TGGTGAGTGTGTGTGTGTGT TAGTGAGTGTGTGTGTGTGT . HETLOW0.05;foo_region INFRAME_PGEOM=0.95;INFRAME_UP=0.01;INFRAME_DOWN=0.01;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0.01;OUTFRAME_DOWN=0.01;START=14390205;END=14390218;PERIOD=2;NSKIP=0;NFILT=0;BPDIFFS=0;DP=222;DSNP=0;DSTUTTER=2;DFLANKINDEL=7;AN=6;REFAC=3;AC=3;HRUN=2;HET=0;HWEP=1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|1:0|0:1:0.5:77:0:1:2:48.14|28.86:0|0:66.66:-1.65:63:-0.96:-2|1;0|57:0|53:PASS 0|1:0|0:1:0.5:67:0:1:4:54.58|12.42:0|0:5.56:-8.49:54:-0.65:-2|1;0|50:0|42:PASS 0|1:0|0:1:0.5:78:0:0:1:54.02|23.98:0|0:44.75:-3.64:64:-0.4:0|55:0|53:PASS chr21 14394189 GANGSTR_437031 ACAAAAACAAACAAACAAAC AAAACAAACAAACAAAC . CALLRATE0.5;foo_region;NO_CALLS_REMAINING INFRAME_PGEOM=0.59;INFRAME_UP=0.02;INFRAME_DOWN=0;OUTFRAME_PGEOM=0.69;OUTFRAME_UP=0;OUTFRAME_DOWN=0.01;START=14394193;END=14394208;PERIOD=4;NSKIP=0;NFILT=3;BPDIFFS=-3;DP=0;DSNP=0;DSTUTTER=0;DFLANKINDEL=0;AN=0;REFAC=0;AC=0;HRUN=5;HET=-1;HWEP=-1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL -chr21 14405799 GANGSTR_437032 ACCCACCCACCC ACCCGCCCACCC . HWE0.5;HETHIGH0.45 INFRAME_PGEOM=0.95;INFRAME_UP=0;INFRAME_DOWN=0;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0;OUTFRAME_DOWN=0;START=14405799;END=14405810;PERIOD=4;NSKIP=0;NFILT=0;BPDIFFS=0;DP=342;DSNP=0;DSTUTTER=0;DFLANKINDEL=1;AN=6;REFAC=3;AC=3;HRUN=3;HET=0.5;HWEP=0.25 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|1:0|0:1:0.5:115:0:0:1:59.46|55.54:0|0:179.46:-0.11:104:-0.16:0|87:0|86:PASS 0|1:0|0:1:0.5:109:0:0:0:53.35|55.65:0|0:152.82:-0.04:98:-0.53:0|83:0|81:PASS 0|1:0|0:1:0.5:118:0:0:0:74.97|43.03:0|0:123.77:-2.66:104:0:0|82:0|78:PASS +chr21 14405799 GANGSTR_437032 ACCCACCCACCC ACCCGCCCACCC . HWE0.5;HETHIGH0.45;foo_region INFRAME_PGEOM=0.95;INFRAME_UP=0;INFRAME_DOWN=0;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0;OUTFRAME_DOWN=0;START=14405799;END=14405810;PERIOD=4;NSKIP=0;NFILT=0;BPDIFFS=0;DP=342;DSNP=0;DSTUTTER=0;DFLANKINDEL=1;AN=6;REFAC=3;AC=3;HRUN=3;HET=0.5;HWEP=0.25 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|1:0|0:1:0.5:115:0:0:1:59.46|55.54:0|0:179.46:-0.11:104:-0.16:0|87:0|86:PASS 0|1:0|0:1:0.5:109:0:0:0:53.35|55.65:0|0:152.82:-0.04:98:-0.53:0|83:0|81:PASS 0|1:0|0:1:0.5:118:0:0:0:74.97|43.03:0|0:123.77:-2.66:104:0:0|82:0|78:PASS chr21 14407537 GANGSTR_437033 AACAACAACAACAAC . . HETLOW0.05 INFRAME_PGEOM=1;INFRAME_UP=0;INFRAME_DOWN=0.01;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0;OUTFRAME_DOWN=0;START=14407537;END=14407551;PERIOD=3;NSKIP=0;NFILT=0;DP=384;DSNP=0;DSTUTTER=1;DFLANKINDEL=0;AN=6;REFAC=6;HRUN=2;HET=0;HWEP=1;AC=0 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|0:0|0:1:1:120:0:0:0:60.00|60.00:0|0:.:0:.:0:0|80:0|78:PASS 0|0:0|0:1:1:134:0:1:0:67.00|67.00:0|0:.:0:.:0:-3|1;0|101:-3|1;0|98:PASS 0|0:0|0:1:1:130:0:0:0:65.00|65.00:0|0:.:0:.:0:0|93:0|90:PASS chr21 14410775 GANGSTR_437034 AGGAGGAGGAGGAGG . . HETLOW0.05 INFRAME_PGEOM=0.95;INFRAME_UP=0;INFRAME_DOWN=0;OUTFRAME_PGEOM=0.95;OUTFRAME_UP=0;OUTFRAME_DOWN=0;START=14410775;END=14410789;PERIOD=3;NSKIP=0;NFILT=0;DP=291;DSNP=0;DSTUTTER=0;DFLANKINDEL=1;AN=6;REFAC=6;HRUN=2;HET=0;HWEP=1;AC=0 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER 0|0:0|0:1:1:89:0:0:0:44.50|44.50:0|0:.:0:.:0:-2|1;0|60:0|59:PASS 0|0:0|0:1:1:104:0:0:0:52.00|52.00:0|0:.:0:.:0:0|80:0|76:PASS 0|0:0|0:1:1:98:0:0:1:49.00|49.00:0|0:.:0:.:0:0|66:0|61:PASS chr21 14411793 GANGSTR_437035 AATAAATAAATAAATAAATAAATAAATA AAATAAATAAATAAATAAATAAATAAATA,AAATAAATAAATAAATAAATAAATAAATAAATAAATA . CALLRATE0.5;NO_CALLS_REMAINING INFRAME_PGEOM=0.58;INFRAME_UP=0.05;INFRAME_DOWN=0.01;OUTFRAME_PGEOM=0.27;OUTFRAME_UP=0.04;OUTFRAME_DOWN=0.03;START=14411793;END=14411820;PERIOD=4;NSKIP=0;NFILT=3;BPDIFFS=1,9;DP=0;DSNP=0;DSTUTTER=0;DFLANKINDEL=0;AN=0;REFAC=0;AC=0,0;HRUN=3;HET=-1;HWEP=-1 GT:GB:Q:PQ:DP:DSNP:DSTUTTER:DFLANKINDEL:PDP:PSNP:GLDIFF:AB:DAB:FS:ALLREADS:MALLREADS:FILTER .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL .:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:NOCALL