diff --git a/src/ga4gh/vrs/normalize.py b/src/ga4gh/vrs/normalize.py index 2f4c8aff..341ebb17 100644 --- a/src/ga4gh/vrs/normalize.py +++ b/src/ga4gh/vrs/normalize.py @@ -78,12 +78,17 @@ def _get_new_allele_location_pos( return val -def _normalize_allele(input_allele, data_proxy): +def _normalize_allele(input_allele, data_proxy, rle_seq_limit=50): """Normalize Allele using "fully-justified" normalization adapted from NCBI's VOCA. Fully-justified normalization expands such ambiguous representation over the entire region of ambiguity, resulting in an unambiguous representation that may be readily compared with other alleles. + :param input_allele: Input VRS Allele object + :param data_proxy: SeqRepo dataproxy + :param rle_seq_limit: If RLE is set as the new state, set the limit for the length + of the `sequence`. To exclude, set to 0. + Does not attempt to normalize Allele's with definite ranges. Will return the `input_allele` """ @@ -125,7 +130,7 @@ def _normalize_allele(input_allele, data_proxy): # Deletion repeat_subunit_len = len_ref_seq else: - repeat_subunit_len = 0 + repeat_subunit_len = len_alt_seq - len_ref_seq new_allele = pydantic_copy(allele) try: @@ -155,10 +160,17 @@ def _normalize_allele(input_allele, data_proxy): # a Location specified by the coordinates of the new ival, a length # specified by the length of the alternate allele, and a repeat subunit # length + sequence = models.SequenceString(new_alleles[1]) + len_sequence = len(sequence.root) + new_allele.state = models.ReferenceLengthExpression( - length=len(new_alleles[1]), + length=len_sequence, repeatSubunitLength=repeat_subunit_len ) + + if rle_seq_limit and len_sequence < rle_seq_limit: + new_allele.state.sequence = sequence + except ValueError: # Occurs for ref agree Alleles (when alt = ref) pass @@ -179,27 +191,26 @@ def _normalize_haplotype(o, data_proxy=None): return o -def _normalize_variationset(o, data_proxy=None): - o.members = sorted(o.members, key=ga4gh_digest) - return o - - handlers = { "Allele": _normalize_allele, "Haplotype": _normalize_haplotype, - "VariationSet": _normalize_variationset, } -def normalize(vo, data_proxy=None): - """normalize given vrs object, regardless of type""" +def normalize(vo, data_proxy=None, **kwargs): + """normalize given vrs object, regardless of type + + kwargs: + rle_seq_limit: If RLE is set as the new state, set the limit for the length + of the `sequence`. To exclude `state.sequence`, set to 0. + """ assert is_pydantic_instance(vo) vo_type = vo.type if vo_type in handlers: handler = handlers[vo_type] - return handler(vo, data_proxy) + return handler(vo, data_proxy, **kwargs) # No handler for vo_type; pass-through unchanged return vo diff --git a/tests/cassettes/test_normalize_allele.yaml b/tests/cassettes/test_normalize_allele.yaml index 920f284f..aced451c 100644 --- a/tests/cassettes/test_normalize_allele.yaml +++ b/tests/cassettes/test_normalize_allele.yaml @@ -34,7 +34,7 @@ interactions: Content-Type: - application/json Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -64,7 +64,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -94,7 +94,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -135,7 +135,7 @@ interactions: Content-Type: - application/json Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -165,7 +165,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -195,7 +195,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -225,7 +225,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -255,7 +255,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -285,7 +285,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -315,7 +315,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:28 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -345,7 +345,7 @@ interactions: Content-Type: - text/plain; charset=utf-8 Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:29 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: @@ -386,7 +386,318 @@ interactions: Content-Type: - application/json Date: - - Thu, 24 Aug 2023 02:16:39 GMT + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/metadata/refseq:NC_000023.11 + response: + body: + string: "{\n \"added\": \"2016-08-27T23:57:18Z\",\n \"aliases\": [\n \"GRCh38:X\",\n + \ \"GRCh38:chrX\",\n \"GRCh38.p1:X\",\n \"GRCh38.p1:chrX\",\n \"GRCh38.p10:X\",\n + \ \"GRCh38.p10:chrX\",\n \"GRCh38.p11:X\",\n \"GRCh38.p11:chrX\",\n + \ \"GRCh38.p12:X\",\n \"GRCh38.p12:chrX\",\n \"GRCh38.p2:X\",\n \"GRCh38.p2:chrX\",\n + \ \"GRCh38.p3:X\",\n \"GRCh38.p3:chrX\",\n \"GRCh38.p4:X\",\n \"GRCh38.p4:chrX\",\n + \ \"GRCh38.p5:X\",\n \"GRCh38.p5:chrX\",\n \"GRCh38.p6:X\",\n \"GRCh38.p6:chrX\",\n + \ \"GRCh38.p7:X\",\n \"GRCh38.p7:chrX\",\n \"GRCh38.p8:X\",\n \"GRCh38.p8:chrX\",\n + \ \"GRCh38.p9:X\",\n \"GRCh38.p9:chrX\",\n \"MD5:2b3a55ff7f58eb308420c8a9b11cac50\",\n + \ \"NCBI:NC_000023.11\",\n \"refseq:NC_000023.11\",\n \"SEGUID:Z9QbQrrPjpjXSMJesDYqC3A43lA\",\n + \ \"SHA1:67d41b42bacf8e98d748c25eb0362a0b7038de50\",\n \"VMC:GS_w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n + \ \"sha512t24u:w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\",\n \"ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP\"\n + \ ],\n \"alphabet\": \"ACGNRSTWY\",\n \"length\": 156040895\n}\n" + headers: + Connection: + - close + Content-Length: + - '978' + Content-Type: + - application/json + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980373&end=155980375 + response: + body: + string: GT + headers: + Connection: + - close + Content-Length: + - '2' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980373&end=155980375 + response: + body: + string: GT + headers: + Connection: + - close + Content-Length: + - '2' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980374&end=155980375 + response: + body: + string: T + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980373&end=155980374 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980372&end=155980373 + response: + body: + string: G + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980375&end=155980376 + response: + body: + string: T + headers: + Connection: + - close + Content-Length: + - '1' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980373&end=155980375 + response: + body: + string: GT + headers: + Connection: + - close + Content-Length: + - '2' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980375&end=155980375 + response: + body: + string: '' + headers: + Connection: + - close + Content-Length: + - '0' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT + Server: + - Werkzeug/2.2.2 Python/3.10.4 + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 + method: GET + uri: http://localhost:5000/seqrepo/1/sequence/refseq:NC_000023.11?start=155980373&end=155980375 + response: + body: + string: GT + headers: + Connection: + - close + Content-Length: + - '2' + Content-Type: + - text/plain; charset=utf-8 + Date: + - Thu, 24 Aug 2023 16:40:29 GMT Server: - Werkzeug/2.2.2 Python/3.10.4 status: diff --git a/tests/test_vrs_normalize.py b/tests/test_vrs_normalize.py index 7bdcaa73..94f946fd 100644 --- a/tests/test_vrs_normalize.py +++ b/tests/test_vrs_normalize.py @@ -67,6 +67,38 @@ } +allele_dict4 = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequence": "refseq:NC_000023.11", + "start": 155980373, + "end": 155980375 + }, + "state": { + "sequence": "GTGT", + "type": "LiteralSequenceExpression" + } +} + + +allele_dict4_normalized = { + "type": "Allele", + "location": { + "type": "SequenceLocation", + "sequence": "refseq:NC_000023.11", + "start": 155980373, + "end": 155980375 + }, + "state": { + "length": 4, + "repeatSubunitLength": 2, + "sequence": "GTGT", + "type": "ReferenceLengthExpression" + } +} + + @pytest.mark.vcr def test_normalize_allele(rest_dataproxy): allele1 = models.Allele(**allele_dict) @@ -74,7 +106,7 @@ def test_normalize_allele(rest_dataproxy): assert allele1 == allele2 allele1 = models.Allele(**allele_dict2) - allele2 = normalize(allele1, rest_dataproxy) + allele2 = normalize(allele1, rest_dataproxy, rle_seq_limit=0) assert allele1 != allele2 assert allele2 == models.Allele(**allele_dict2_normalized) @@ -82,3 +114,7 @@ def test_normalize_allele(rest_dataproxy): allele3 = models.Allele(**allele_dict3) allele3_after_norm = normalize(allele3, rest_dataproxy) assert allele3_after_norm == allele3 + + allele4 = models.Allele(**allele_dict4) + allele4_after_norm = normalize(allele4, rest_dataproxy) + assert allele4_after_norm == models.Allele(**allele_dict4_normalized)