From b98a925c9de1329a3fdcd15134f00b4a76e408bf Mon Sep 17 00:00:00 2001 From: MarvinDo Date: Fri, 23 Aug 2024 15:31:17 +0200 Subject: [PATCH] improved hci_priors download --- data/script/download_hci_priors.sh | 20 ++++++++++++-------- tools/priors_crawler.py | 9 +++++++-- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/data/script/download_hci_priors.sh b/data/script/download_hci_priors.sh index 016e542b..7a5ae553 100755 --- a/data/script/download_hci_priors.sh +++ b/data/script/download_hci_priors.sh @@ -79,8 +79,11 @@ cd HCI_priors python3 $dbconverter -g BRCA1 -e exon2 --header > priors_hg19.vcf python3 $dbconverter -g BRCA2 -e exon2 >> priors_hg19.vcf -$ngsbits/VcfCheck -in priors_hg19.vcf -ref $grch37 -lines 0 > vcferrors_hg19.txt +$ngsbits/VcfLeftNormalize -in priors_hg19.vcf -ref $grch37 -out priors_hg19.normalized.vcf +mv priors_hg19.normalized.vcf priors_hg19.vcf $ngsbits/VcfSort -in priors_hg19.vcf -out priors_hg19.vcf + +$ngsbits/VcfCheck -in priors_hg19.vcf -ref $grch37 -lines 0 > vcferrors_hg19.txt bgzip -f -c priors_hg19.vcf > priors_hg19.vcf.gz tabix -p vcf priors_hg19.vcf.gz @@ -91,26 +94,27 @@ rm priors_hg19.vcf.gz rm priors_hg19.vcf.gz.tbi +# these are already available in grch38 positions python3 $dbconverter -g MLH1 -e exon1 >> priors.vcf - -##### STILL MISSING: -python3 $dbconverter -g MSH2 -e exon1 > priors_msh2.vcf +python3 $dbconverter -g MSH2 -e exon1 >> priors.vcf python3 $dbconverter -g MSH6 -e exon1 >> priors.vcf -$ngsbits/VcfSort -in priors.vcf -out priors.vcf -cat priors.vcf | $ngsbits/VcfLeftNormalize -stream -ref $grch38 | $ngsbits/VcfStreamSort > priors.normalized.vcf -rm priors.vcf +$ngsbits/VcfLeftNormalize -in priors.vcf -ref $grch38 -out priors.normalized.vcf mv priors.normalized.vcf priors.vcf +$ngsbits/VcfSort -in priors.vcf -out priors.vcf + +$ngsbits/VcfCheck -in priors.vcf -ref $grch38 > vcferrors.txt bgzip -f -c priors.vcf > priors.vcf.gz tabix -p vcf priors.vcf.gz -$ngsbits/VcfCheck -in priors.vcf.gz -ref 
$grch38 > vcferrors.txt +#python3 /mnt/storage2/users/ahdoebm1/MaxEntScanStats/src/vcf_check_duplicates/main.py +#python3 /mnt/storage2/users/ahdoebm1/MaxEntScanStats/src/vcf_check_duplicates/merge_positions.py diff --git a/tools/priors_crawler.py b/tools/priors_crawler.py index ee5b77fe..26bcc271 100755 --- a/tools/priors_crawler.py +++ b/tools/priors_crawler.py @@ -91,7 +91,7 @@ def retry_parse_html(url): base_url = "https://priors.hci.utah.edu/PRIORS/BRCA/" -exon_url = urljoin(base_url, ("viewer.php?gene=%s&exon=%s" % (gene, first_exon))) +first_exon_url = urljoin(base_url, ("viewer.php?gene=%s&exon=%s" % (gene, first_exon))) if include_header: @@ -128,7 +128,7 @@ def retry_parse_html(url): all_exon_urls = [] -doc = retry_parse_html(exon_url) +doc = retry_parse_html(first_exon_url) for tr in doc.iter('tr'): text_content=tr.text_content() if text_content.startswith('EXON'): @@ -137,8 +137,13 @@ def retry_parse_html(url): all_exon_urls.append(new_exon_url) +consider = False for exon_url in all_exon_urls: functions.eprint(exon_url) + if first_exon_url in exon_url: + consider = True + if not consider: + continue doc = retry_parse_html(exon_url) #seq_container = doc.xpath("//td[@class='seqarea']")[0] for variant_url_container in doc.xpath("//a[@class='seq']"):