From 152d2434b2d48d79500198c3b30f571be768c25b Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 8 Nov 2023 16:16:28 -0500 Subject: [PATCH] fix: gnomad_re should accept all nucleotide characters for ref/alt (#276) (#277) Co-authored-by: Alex H. Wagner, PhD --- src/ga4gh/vrs/extras/translator.py | 5 +- tests/extras/cassettes/test_from_gnomad.yaml | 155 +++++++++++++++---- tests/extras/test_translator.py | 8 + tests/extras/test_vcf_annotation.py | 2 +- 4 files changed, 141 insertions(+), 29 deletions(-) diff --git a/src/ga4gh/vrs/extras/translator.py b/src/ga4gh/vrs/extras/translator.py index 488a3ecd..e66f83b3 100644 --- a/src/ga4gh/vrs/extras/translator.py +++ b/src/ga4gh/vrs/extras/translator.py @@ -41,7 +41,10 @@ class Translator: """ beacon_re = re.compile(r"(?P<chr>[^-]+)\s*:\s*(?P<pos>\d+)\s*(?P<ref>\w+)\s*>\s*(?P<alt>\w+)") - gnomad_re = re.compile(r"(?P<chr>[^-]+)-(?P<pos>\d+)-(?P<ref>[ACGTN]+)-(?P<alt>[ACGTN]+|\*|\.)", re.IGNORECASE) + gnomad_re = re.compile( + r"(?P<chr>[^-]+)-(?P<pos>\d+)-(?P<ref>[ACGTURYKMSWBDHVN]+)-(?P<alt>[ACGTURYKMSWBDHVN]+)", + re.IGNORECASE + ) hgvs_re = re.compile(r"[^:]+:[cgnpr]\.") spdi_re = re.compile(r"(?P<ac>[^:]+):(?P<pos>\d+):(?P<del_len_or_seq>\w*):(?P<ins_seq>\w*)") diff --git a/tests/extras/cassettes/test_from_gnomad.yaml b/tests/extras/cassettes/test_from_gnomad.yaml index 4ef8beeb..ede18f3a 100644 --- a/tests/extras/cassettes/test_from_gnomad.yaml +++ b/tests/extras/cassettes/test_from_gnomad.yaml @@ -9,7 +9,7 @@ interactions: Connection: - keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:19?start=44908821&end=44908822 response: @@ -23,7 +23,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -39,33 +39,134 @@ interactions: Connection: - keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET - uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:19 + uri: 
http://localhost:5000/seqrepo/1/sequence/GRCh38:17?start=83129586&end=83129598 response: body: - string: "{\n \"added\": \"2016-08-24T08:19:02Z\",\n \"aliases\": [\n \"Ensembl:19\",\n - \ \"ensembl:19\",\n \"GRCh38:19\",\n \"GRCh38:chr19\",\n \"GRCh38.p1:19\",\n - \ \"GRCh38.p1:chr19\",\n \"GRCh38.p10:19\",\n \"GRCh38.p10:chr19\",\n - \ \"GRCh38.p11:19\",\n \"GRCh38.p11:chr19\",\n \"GRCh38.p12:19\",\n - \ \"GRCh38.p12:chr19\",\n \"GRCh38.p2:19\",\n \"GRCh38.p2:chr19\",\n - \ \"GRCh38.p3:19\",\n \"GRCh38.p3:chr19\",\n \"GRCh38.p4:19\",\n \"GRCh38.p4:chr19\",\n - \ \"GRCh38.p5:19\",\n \"GRCh38.p5:chr19\",\n \"GRCh38.p6:19\",\n \"GRCh38.p6:chr19\",\n - \ \"GRCh38.p7:19\",\n \"GRCh38.p7:chr19\",\n \"GRCh38.p8:19\",\n \"GRCh38.p8:chr19\",\n - \ \"GRCh38.p9:19\",\n \"GRCh38.p9:chr19\",\n \"MD5:b0eba2c7bb5c953d1e06a508b5e487de\",\n - \ \"NCBI:NC_000019.10\",\n \"refseq:NC_000019.10\",\n \"SEGUID:AHxM5/L8jIX08UhBBkKXkiO5rhY\",\n - \ \"SHA1:007c4ce7f2fc8c85f4f148410642979223b9ae16\",\n \"VMC:GS_IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n - \ \"sha512t24u:IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\",\n \"ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl\"\n - \ ],\n \"alphabet\": \"ACGNT\",\n \"length\": 58617616\n}\n" + string: GTTGWCACATGA headers: Connection: - close Content-Length: - - '1035' + - '12' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Tue, 07 Nov 2023 23:05:59 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:17 + response: + body: + string: "{\n \"added\": \"2016-08-27T23:52:54Z\",\n \"aliases\": [\n \"GRCh38:17\",\n + \ \"GRCh38:chr17\",\n \"GRCh38.p1:17\",\n \"GRCh38.p1:chr17\",\n \"GRCh38.p10:17\",\n + \ \"GRCh38.p10:chr17\",\n \"GRCh38.p11:17\",\n \"GRCh38.p11:chr17\",\n + \ \"GRCh38.p12:17\",\n 
\"GRCh38.p12:chr17\",\n \"GRCh38.p2:17\",\n + \ \"GRCh38.p2:chr17\",\n \"GRCh38.p3:17\",\n \"GRCh38.p3:chr17\",\n + \ \"GRCh38.p4:17\",\n \"GRCh38.p4:chr17\",\n \"GRCh38.p5:17\",\n \"GRCh38.p5:chr17\",\n + \ \"GRCh38.p6:17\",\n \"GRCh38.p6:chr17\",\n \"GRCh38.p7:17\",\n \"GRCh38.p7:chr17\",\n + \ \"GRCh38.p8:17\",\n \"GRCh38.p8:chr17\",\n \"GRCh38.p9:17\",\n \"GRCh38.p9:chr17\",\n + \ \"MD5:f9a0fb01553adb183568e3eb9d8626db\",\n \"NCBI:NC_000017.11\",\n + \ \"refseq:NC_000017.11\",\n \"SEGUID:s2Skupj8o6wdjf0aPrgOipAr67Q\",\n + \ \"SHA1:b364a4ba98fca3ac1d8dfd1a3eb80e8a902bebb4\",\n \"VMC:GS_dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\",\n + \ \"sha512t24u:dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\",\n \"ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7\"\n + \ ],\n \"alphabet\": \"ACGKNRSTWY\",\n \"length\": 83257441\n}\n" + headers: + Connection: + - close + Content-Length: + - '1004' + Content-Type: + - application/json + Date: + - Tue, 07 Nov 2023 23:05:59 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:7?start=1&end=17 + response: + body: + string: NNNNNNNNNNNNNNNN + headers: + Connection: + - close + Content-Length: + - '16' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Tue, 07 Nov 2023 23:05:59 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:7 + response: + body: + string: "{\n \"added\": \"2016-08-27T21:23:35Z\",\n \"aliases\": [\n \"GRCh38:7\",\n + \ \"GRCh38:chr7\",\n \"GRCh38.p1:7\",\n \"GRCh38.p1:chr7\",\n \"GRCh38.p10:7\",\n + \ 
\"GRCh38.p10:chr7\",\n \"GRCh38.p11:7\",\n \"GRCh38.p11:chr7\",\n + \ \"GRCh38.p12:7\",\n \"GRCh38.p12:chr7\",\n \"GRCh38.p2:7\",\n \"GRCh38.p2:chr7\",\n + \ \"GRCh38.p3:7\",\n \"GRCh38.p3:chr7\",\n \"GRCh38.p4:7\",\n \"GRCh38.p4:chr7\",\n + \ \"GRCh38.p5:7\",\n \"GRCh38.p5:chr7\",\n \"GRCh38.p6:7\",\n \"GRCh38.p6:chr7\",\n + \ \"GRCh38.p7:7\",\n \"GRCh38.p7:chr7\",\n \"GRCh38.p8:7\",\n \"GRCh38.p8:chr7\",\n + \ \"GRCh38.p9:7\",\n \"GRCh38.p9:chr7\",\n \"MD5:cc044cc2256a1141212660fb07b6171e\",\n + \ \"NCBI:NC_000007.14\",\n \"refseq:NC_000007.14\",\n \"SEGUID:4+JjCcBVhPCr8vdIhUKFycPv8bY\",\n + \ \"SHA1:e3e26309c05584f0abf2f748854285c9c3eff1b6\",\n \"VMC:GS_F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n + \ \"sha512t24u:F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\",\n \"ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul\"\n + \ ],\n \"alphabet\": \"ACGNRSTY\",\n \"length\": 159345973\n}\n" + headers: + Connection: + - close + Content-Length: + - '977' Content-Type: - application/json Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -81,7 +182,7 @@ interactions: Connection: - keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732 response: @@ -95,7 +196,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -111,7 +212,7 @@ interactions: Connection: - keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732 response: @@ -125,7 +226,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -141,7 +242,7 @@ interactions: Connection: - 
keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET uri: http://localhost:5000/seqrepo/1/sequence/GRCh38:13?start=32936731&end=32936732 response: @@ -155,7 +256,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -171,7 +272,7 @@ interactions: Connection: - keep-alive User-Agent: - - python-requests/2.28.1 + - python-requests/2.31.0 method: GET uri: http://localhost:5000/seqrepo/1/metadata/GRCh38:13 response: @@ -197,7 +298,7 @@ interactions: Content-Type: - application/json Date: - - Mon, 16 Jan 2023 16:32:42 GMT + - Tue, 07 Nov 2023 23:05:59 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: diff --git a/tests/extras/test_translator.py b/tests/extras/test_translator.py index facedce1..396f28c0 100644 --- a/tests/extras/test_translator.py +++ b/tests/extras/test_translator.py @@ -106,6 +106,14 @@ def test_from_beacon(tlr): def test_from_gnomad(tlr): assert tlr._from_gnomad(snv_inputs["gnomad"]).as_dict() == snv_output + assert tlr._from_gnomad("17-83129587-GTTGWCACATGA-G") + + # Test valid characters + assert tlr._from_gnomad( + "7-2-ACGTURYKMSWBDHVN-ACGTURYKMSWBDHVN", + require_validation=False + ) + # Invalid input. Ref does not match regex assert not tlr._from_gnomad("13-32936732-helloworld-C") diff --git a/tests/extras/test_vcf_annotation.py b/tests/extras/test_vcf_annotation.py index f74606c8..2c99bc65 100644 --- a/tests/extras/test_vcf_annotation.py +++ b/tests/extras/test_vcf_annotation.py @@ -87,4 +87,4 @@ def test_get_vrs_object_invalid_input(vcf_annotator, caplog): # No ALT vcf_annotator._get_vrs_object("7-140753336-A-.", {}, [], "GRCh38") - assert "ValidationError when translating 7-140753336-A-. from gnomad" in caplog.text + assert "None was returned when translating 7-140753336-A-. from gnomad" in caplog.text