From a8b2bac59a9ed029ee858ab6029db8d23393396f Mon Sep 17 00:00:00 2001 From: "Stromberg, Michael" Date: Wed, 2 Jun 2021 16:52:01 -0700 Subject: [PATCH] Feature/rebase 3.6.2 (#592) * Disabled left-alignment of variants in legacy VID mode. (#505) * Disabled left-alignment of variants in legacy VID mode. * Added some unit tests to check the left alignment behavior between normal and legacy VID mode. * Updated the ReSharper settings to properly set var behavior. * Fixed the LOH VID issue in CNVs (#503) * Fixed the parsing issue when encountering reference sequences with colons. (#494) * Updating the version * Updated Nirvana to avoid using the bidirectional trimmer on translocations when using the legacy VID option. (#507) * Added HLA reference parsing to the legacy variant ID parser. (#508) * Reverted some files that had their permissions changed * Revert "Reverted some files that had their permissions changed" This reverts commit d9e3ffea6c34829c9db372fbe8014b62ed412694. * Minor cleanup with file permissions * Minor ReSharper changes * Removed unnecessary comments in LegacyVariantId --- UnitTests/Downloader/ConfigurationTests.cs | 0 .../VariantCreator/LegacyVariantIdTests.cs | 98 +++-- .../Vcf/VariantCreator/VariantFactoryTests.cs | 378 +++++++++++++++++- .../VariantFactoryTestsWithLegacyVids.cs | 320 +++++++++++++++ .../IVariantIdCreator.cs | 2 + Vcf/VariantCreator/LegacyVariantId.cs | 27 +- Vcf/VariantCreator/VariantFactory.cs | 6 +- Vcf/VariantCreator/VariantId.cs | 4 + 8 files changed, 772 insertions(+), 63 deletions(-) mode change 100644 => 100755 UnitTests/Downloader/ConfigurationTests.cs create mode 100644 UnitTests/Vcf/VariantCreator/VariantFactoryTestsWithLegacyVids.cs diff --git a/UnitTests/Downloader/ConfigurationTests.cs b/UnitTests/Downloader/ConfigurationTests.cs old mode 100644 new mode 100755 diff --git a/UnitTests/Vcf/VariantCreator/LegacyVariantIdTests.cs b/UnitTests/Vcf/VariantCreator/LegacyVariantIdTests.cs index ffb97e80..48d73498 100644 --- a/UnitTests/Vcf/VariantCreator/LegacyVariantIdTests.cs +++ b/UnitTests/Vcf/VariantCreator/LegacyVariantIdTests.cs @@ -9,69 +9,89 @@ namespace UnitTests.Vcf.VariantCreator { public sealed class LegacyVariantIdTests { - private readonly LegacyVariantId _vidCreator = new LegacyVariantId(ChromosomeUtilities.RefNameToChromosome); + private readonly LegacyVariantId _vidCreator = new(ChromosomeUtilities.RefNameToChromosome); [Theory] - [InlineData(66507, 66507, "T", "A", "1:66507:A")] - [InlineData(66522, 66521, "", "ATATA", "1:66522:66521:ATATA")] - [InlineData(66573, 66574, "TA", "", "1:66573:66574")] - [InlineData(66573, 66572, "", "TACTATATATTA", "1:66573:66572:TACTATATATTA")] - [InlineData(100, 104, "TAGGT", "ACTTA", "1:100:104:ACTTA")] - [InlineData(100, 104, "TAGGT", "", "1:100:104")] - [InlineData(101, 100, "", "CGA", "1:101:100:CGA")] - [InlineData(100, 100, "T", "A", "1:100:A")] - [InlineData(100, 104, "TAGGT", "CGA", "1:100:104:CGA")] - [InlineData(100, 99, "", "ACTGACGTACGAAGTTGCCGTACGTACTTGTCC", "1:100:99:3bd631d37e62d5db0f6d5d6db3cdcb60")] - [InlineData(66366, 66378, "ATATAATATATAA", "TATATATATTATTATATAATATAATATATATTATATAATATATTTTATTATATAATATAATATATATTATATAATATAATATATTTTATTATATAAATATATATTATATTATATAATATAATATATATTAATATAAATATATATTAT", "1:66366:66378:17b72647da13e3c186348467b29b0492")] + [InlineData(66507, 66507, "T", "A", "1:66507:A")] + [InlineData(66522, 66521, "", "ATATA", "1:66522:66521:ATATA")] + [InlineData(66573, 66574, "TA", "", "1:66573:66574")] + [InlineData(66573, 66572, "", "TACTATATATTA", "1:66573:66572:TACTATATATTA")] + [InlineData(100, 104, "TAGGT", "ACTTA", "1:100:104:ACTTA")] + [InlineData(100, 104, "TAGGT", "", "1:100:104")] + [InlineData(101, 100, "", "CGA", "1:101:100:CGA")] + [InlineData(100, 100, "T", "A", "1:100:A")] + [InlineData(100, 104, "TAGGT", "CGA", "1:100:104:CGA")] + [InlineData(100, 99, "", "ACTGACGTACGAAGTTGCCGTACGTACTTGTCC", "1:100:99:3bd631d37e62d5db0f6d5d6db3cdcb60")] + [InlineData(66366, 66378, "ATATAATATATAA", + "TATATATATTATTATATAATATAATATATATTATATAATATATTTTATTATATAATATAATATATATTATATAATATAATATATTTTATTATATAAATATATATTATATTATATAATATAATATATATTAATATAAATATATATTAT", + "1:66366:66378:17b72647da13e3c186348467b29b0492")] [InlineData(100, 300, "", "", "1:100:*")] public void Create_SmallVariants_ReturnVid(int start, int end, string refAllele, string altAllele, string expectedVid) { - string observedVid = _vidCreator.Create(null, VariantCategory.SmallVariant, null, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, null); + string observedVid = _vidCreator.Create(null, VariantCategory.SmallVariant, null, ChromosomeUtilities.Chr1, start, end, refAllele, + altAllele, null); Assert.Equal(expectedVid, observedVid); } [Theory] [InlineData(66507, 66507, "T", ".", "1:66507:66507:T")] - [InlineData(100, 100, "T", "T", "1:100:100:T")] - [InlineData(100, 100, "T", ".", "1:100:100:T")] + [InlineData(100, 100, "T", "T", "1:100:100:T")] + [InlineData(100, 100, "T", ".", "1:100:100:T")] public void Create_Reference_ReturnVid(int start, int end, string refAllele, string altAllele, string expectedVid) { - string observedVid = _vidCreator.Create(null, VariantCategory.Reference, null, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, null); + string observedVid = _vidCreator.Create(null, VariantCategory.Reference, null, ChromosomeUtilities.Chr1, start, end, refAllele, altAllele, + null); Assert.Equal(expectedVid, observedVid); } - [Fact] - public void Create_TranslocationBreakend_ReturnVid() + [Theory] + [InlineData(2617277, "A", "AAAAAAAAAAAAAAAAAATTAGTCAGGCAC[chr3:153444911[", "2:2617277:+:3:153444911:+")] + [InlineData(32973490, "T", "T]chr9:74198768]", "2:32973490:+:9:74198768:-")] + [InlineData(321681, "G", "G[13:123460[", "2:321681:+:13:123460:+")] + [InlineData(32527769, "C", "[HLA-DRB1*13:02:01:3117[C", "2:32527769:-:HLA-DRB1*13:02:01:3117:+")] + public void Create_TranslocationBreakend_ReturnVid(int position, string refAllele, string altAllele, string expectedVid) { - string observedVid = _vidCreator.Create(null, VariantCategory.SV, "BND", ChromosomeUtilities.Chr2, 321681, 321681, "G", "G[13:123460[", null); - Assert.Equal("2:321681:+:13:123460:+", observedVid); + string observedVid = _vidCreator.Create(null, VariantCategory.SV, "BND", ChromosomeUtilities.Chr2, position, position, refAllele, + altAllele, null); + Assert.Equal(expectedVid, observedVid); } [Theory] - [InlineData(1000, 3001000, "", null, "ROH", VariantCategory.ROH, "1:1001:3001000:ROH")] - [InlineData(1350082, 1351320, "", null, "DEL", VariantCategory.SV, "1:1350083:1351320")] - [InlineData(999, 2015, "", null, "DUP", VariantCategory.SV, "1:1000:2015:DUP")] - [InlineData(1477854, 1477984, "", null, "DUP", VariantCategory.SV, "1:1477855:1477984:TDUP")] - [InlineData(1477968, 1477968, "", null, "INS", VariantCategory.SV, "1:1477969:1477968:INS")] - [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CNV")] - [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CN3")] - [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDUP")] - [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDEL")] - [InlineData(2000, 5000, "", null, "ALU", VariantCategory.SV, "1:2001:5000:MEI")] - [InlineData(2000, 5000, "", null, "LINE1", VariantCategory.SV, "1:2001:5000:MEI")] - [InlineData(2000, 5000, "", null, "SVA", VariantCategory.SV, "1:2001:5000:MEI")] - [InlineData(2000, 5000, "", null, "BOB", VariantCategory.SV, "1:2001:5000")] - [InlineData(1715898, 1750149, "", null, "CNV", VariantCategory.CNV, "1:1715899:1750149:CDUP")] - [InlineData(2650426, 2653074, "", null, "CNV", VariantCategory.CNV, "1:2650427:2653074:CDEL")] - [InlineData(321682, 421681, "", null, "INV", VariantCategory.SV, "1:321683:421681:Inverse")] - [InlineData(199, 202, "", "TTG", "", VariantCategory.RepeatExpansion, "1:200:202:TTG:5")] - public void Create_StructuralVariants_ReturnVid(int start, int end, string altAllele, string repeatUnit, - string svType, VariantCategory category, string expectedVid) + [InlineData(1000, 3001000, "", null, "ROH", VariantCategory.ROH, "1:1001:3001000:ROH")] + [InlineData(1350082, 1351320, "", null, "DEL", VariantCategory.SV, "1:1350083:1351320")] + [InlineData(999, 2015, "", null, "DUP", VariantCategory.SV, "1:1000:2015:DUP")] + [InlineData(1477854, 1477984, "", null, "DUP", VariantCategory.SV, "1:1477855:1477984:TDUP")] + [InlineData(1477968, 1477968, "", null, "INS", VariantCategory.SV, "1:1477969:1477968:INS")] + [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CNV")] + [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CN3")] + [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDUP")] + [InlineData(2000, 5000, "", null, "CNV", VariantCategory.CNV, "1:2001:5000:CDEL")] + [InlineData(2000, 5000, "", null, "ALU", VariantCategory.SV, "1:2001:5000:MEI")] + [InlineData(2000, 5000, "", null, "LINE1", VariantCategory.SV, "1:2001:5000:MEI")] + [InlineData(2000, 5000, "", null, "SVA", VariantCategory.SV, "1:2001:5000:MEI")] + [InlineData(2000, 5000, "", null, "BOB", VariantCategory.SV, "1:2001:5000")] + [InlineData(1715898, 1750149, "", null, "CNV", VariantCategory.CNV, "1:1715899:1750149:CDUP")] + [InlineData(2650426, 2653074, "", null, "CNV", VariantCategory.CNV, "1:2650427:2653074:CDEL")] + [InlineData(321682, 421681, "", null, "INV", VariantCategory.SV, "1:321683:421681:Inverse")] + [InlineData(199, 202, "", "TTG", "", VariantCategory.RepeatExpansion, "1:200:202:TTG:5")] + public void Create_StructuralVariants_ReturnVid(int start, int end, string altAllele, string repeatUnit, string svType, + VariantCategory category, string expectedVid) { string observedVid = _vidCreator.Create(null, category, svType, ChromosomeUtilities.Chr1, start, end, "", altAllele, repeatUnit); Assert.Equal(expectedVid, observedVid); } + [Fact] + public void Create_LOH_ReturnsCnvVid() + { + const string altAllele = ""; + const string svType = "LOH"; + VariantCategory variantCategory = VariantFactory.GetVariantCategory(altAllele, svType); + + string observedVid = _vidCreator.Create(null, variantCategory, svType, ChromosomeUtilities.Chr1, 787923, 887923, "N", altAllele, null); + Assert.Equal("1:787924:887923:CNV", observedVid); + } + [Fact] public void GetSmallVariantVid_UnknownVariantType_ThrowsException() { diff --git a/UnitTests/Vcf/VariantCreator/VariantFactoryTests.cs b/UnitTests/Vcf/VariantCreator/VariantFactoryTests.cs index bfe16fc9..7b62a47f 100644 --- a/UnitTests/Vcf/VariantCreator/VariantFactoryTests.cs +++ b/UnitTests/Vcf/VariantCreator/VariantFactoryTests.cs @@ -1,7 +1,15 @@ -using CacheUtils.TranscriptCache; +using System.IO; +using CacheUtils.TranscriptCache; using Genome; +using Moq; +using OptimizedCore; +using UnitTests.TestDataStructures; using UnitTests.TestUtilities; +using VariantAnnotation.Interface.IO; +using VariantAnnotation.Interface.Positions; +using VariantAnnotation.Interface.Providers; using Variants; +using Vcf; using Vcf.Info; using Vcf.VariantCreator; using Xunit; @@ -10,12 +18,59 @@ namespace UnitTests.Vcf.VariantCreator { public sealed class VariantFactoryTests { - private static readonly ISequence Sequence = new NSequence(); - private readonly VariantId _vidCreator = new(); + private static readonly ISequence Sequence = new NSequence(); + + private readonly ISequence _chr12Seq = new SimpleSequence( + "TCCCCATGCTGCTCTTTTTTGCAAACACCAACACAATTTGGGCTCCATTTATAAGGCATCTGCTGCACCAACCCTCTTTCTTGGTGCTTACTGGACCTGCTCAGGGTTAATTTCTAACTCAAAGAACCTAACTTGGAGTAACTCCGTACCACCAGCAAAGCGACTGGCTTTGGGGAATGACATTTACAATGTATCCACTGTTATTTGGTCACCCAGCAAACTGTCATTTTTCAGAAACCAGGGCTGTCTCACAAACTGGCTTTCAATAAGGTGGGTTGCTTAGCAACTGCCAAGGAATTAAGAAGACAGAATAAGGTATCCGCCAGAGATATTTTATGACCAAAATGAGCTGCACTCATGTGTCTGGTTGTGTTCAAGGTAACCAAGTAAGAGATAACACCCGACTATTTTTGCATCATGAGGAAAAATACTTGGCTTCTGCCCAGAAGGGCAATTATCTCAAAGTCTTGGCAGGCCCCATGGTATGAGAAATGGTAACTGATATGGGGGTTAAAAAAAA", + 106499648); + + private readonly VariantId _vidCreator = new(); + private readonly LegacyVariantId _legacyVidCreator = new(null); + private readonly Mock _sequenceMock = new(); + private readonly VariantFactory _variantFactory; + private readonly ISequenceProvider _sequenceProvider; + + public VariantFactoryTests() + { + // GRCh38 + _sequenceMock.Setup(x => x.Substring(1037629, 1)).Returns("G"); + _sequenceMock.Setup(x => x.Substring(787922, 1)).Returns("A"); + _sequenceMock.Setup(x => x.Substring(110541588, 1)).Returns("T"); + _sequenceMock.Setup(x => x.Substring(100955983, 1)).Returns("C"); + _sequenceMock.Setup(x => x.Substring(11071438, 1)).Returns("G"); + _sequenceMock.Setup(x => x.Substring(934063, 1)).Returns("A"); + _sequenceMock.Setup(x => x.Substring(36690135, 1)).Returns("C"); + _sequenceMock.Setup(x => x.Substring(20093, 1)).Returns("T"); + _sequenceMock.Setup(x => x.Substring(15902, 1)).Returns("G"); + + // GRCh37 (for multi-allelic deletion with left alignment) + _sequenceMock.Setup(x => x.Substring(106500157, 1)).Returns("G"); + _sequenceMock.Setup(x => x.Substring(106500158, 1)).Returns("T"); + _sequenceMock.Setup(x => x.Substring(106500159, 1)).Returns("T"); + _sequenceMock.Setup(x => x.Substring(106500159, 2)).Returns("TA"); + _sequenceMock.Setup(x => x.Substring(106499659, 500)).Returns( + "CTCTTTTTTGCAAACACCAACACAATTTGGGCTCCATTTATAAGGCATCTGCTGCACCAACCCTCTTTCTTGGTGCTTACTGGACCTGCTCAGGGTTAATTTCTAACTCAAAGAACCTAACTTGGAGTAACTCCGTACCACCAGCAAAGCGACTGGCTTTGGGGAATGACATTTACAATGTATCCACTGTTATTTGGTCACCCAGCAAACTGTCATTTTTCAGAAACCAGGGCTGTCTCACAAACTGGCTTTCAATAAGGTGGGTTGCTTAGCAACTGCCAAGGAATTAAGAAGACAGAATAAGGTATCCGCCAGAGATATTTTATGACCAAAATGAGCTGCACTCATGTGTCTGGTTGTGTTCAAGGTAACCAAGTAAGAGATAACACCCGACTATTTTTGCATCATGAGGAAAAATACTTGGCTTCTGCCCAGAAGGGCAATTATCTCAAAGTCTTGGCAGGCCCCATGGTATGAGAAATGGTAACTGATATGGGGGT"); + + _sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, _sequenceMock.Object, ChromosomeUtilities.RefNameToChromosome); + _variantFactory = new VariantFactory(_sequenceMock.Object, _vidCreator); + } + + private IPosition ParseVcfLine(string vcfLine) + { + string[] vcfFields = vcfLine.OptimizedSplit('\t'); + IChromosome chromosome = ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, vcfFields[VcfCommon.ChromIndex]); + + (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); + if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); + + var simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); + + return Position.ToPosition(simplePosition, null, _sequenceProvider, null, _variantFactory); + } // chr1 69391 . A . . SVTYPE=DEL;END=138730 . . [Fact] - public void GetVariant_svDel() + public void CreateVariants_svDel() { var builder = new InfoDataBuilder {SvType = "DEL", End = 138730}; InfoData infoData = builder.Create(); @@ -28,7 +83,7 @@ public void GetVariant_svDel() // 1 723707 Canvas:GAIN:1:723708:2581225 N 41 PASS SVTYPE=CNV;END=2581225 RC:BC:CN:MCC . 129:3123:3:2 [Fact] - public void GetVariant_canvas_cnv() + public void CreateVariants_canvas_cnv() { var builder = new InfoDataBuilder {SvType = "CNV", End = 2581225}; InfoData infoData = builder.Create(); @@ -45,7 +100,7 @@ public void GetVariant_canvas_cnv() // chr1 854895 Canvas:COMPLEXCNV:chr1:854896-861879 N , . PASS SVTYPE=CNV;END=861879;CNVLEN=6984;CIPOS=-291,291;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:59.45:12:1:1:.:25.34:PASS:. 0/1:59.45:12:1:1:.:25.34:PASS:. 1/2:165.40:12:3:3:16.80:16.71:PASS:. [Fact] - public void GetVariant_canvas_cnx() + public void CreateVariants_canvas_cnx() { var builder = new InfoDataBuilder {SvType = "CNV", End = 861879, CiPos = new[] {-291, 291}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); @@ -65,7 +120,7 @@ public void GetVariant_canvas_cnx() // chr1 1463185 Canvas:COMPLEXCNV:chr1:1463186-1476229 N , . PASS SVTYPE=CNV;END=1476229;CNVLEN=13044;CIPOS=-415,415;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/0:109.56:15:2:.:.:20.04:PASS:. 1/1:0.00:15:0:.:.:64.59:PASS:. ./2:167.45:15:3:.:.:17.87:PASS:. [Fact] - public void GetVariant_canvas_cnv_dup() + public void CreateVariants_canvas_cnv_dup() { var builder = new InfoDataBuilder {SvType = "CNV", End = 1476229, CiPos = new[] {-415, 415}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); @@ -85,7 +140,7 @@ public void GetVariant_canvas_cnv_dup() // chr1 1463185 . N . PASS SVTYPE=DUP;END=1476229;SVLEN=13044;CIPOS=-415,415;CIEND=-291,291 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/0:109.56:15:2:.:.:20.04:PASS:. 1/1:0.00:15:0:.:.:64.59:PASS:. ./1:167.45:15:3:.:.:17.87:PASS:. [Fact] - public void GetVariant_dup() + public void CreateVariants_dup() { var builder = new InfoDataBuilder {SvType = "DUP", End = 1476229, CiPos = new[] {-415, 415}, CiEnd = new[] {-291, 291}}; InfoData infoData = builder.Create(); @@ -102,7 +157,7 @@ public void GetVariant_dup() // 1 37820921 MantaDUP:TANDEM:5515:0:1:0:0:0 G . MGE10kb END=38404543;SVTYPE=DUP;SVLEN=583622;CIPOS=0,1;CIEND=0,1;HOMLEN=1;HOMSEQ=A;SOMATIC;SOMATICSCORE=63;ColocalizedCanvas PR:SR 39,0:44,0 202,26:192,32 [Fact] - public void GetVariant_tandem_duplication() + public void CreateVariants_tandem_duplication() { var builder = new InfoDataBuilder {SvType = "DUP", End = 38404543, SvLength = 583622, CiPos = new[] {0, 1}, CiEnd = new[] {0, 1}}; InfoData infoData = builder.Create(); @@ -117,7 +172,7 @@ public void GetVariant_tandem_duplication() // 1 4000000 . N . ROHLC SVTYPE=ROH;END=4001000 GT . . 1 [Fact] - public void GetVariant_ROH() + public void CreateVariants_ROH() { var builder = new InfoDataBuilder {SvType = "ROH", End = 4001000}; InfoData infoData = builder.Create(); @@ -129,5 +184,308 @@ public void GetVariant_ROH() Assert.Equal(AnnotationBehavior.RunsOfHomozygosity, variants[0].Behavior); Assert.Equal(VariantType.run_of_homozygosity, variants[0].Type); } + + // chr12 106500158 . GTTA GTA,GT . . . + [Fact] + public void CreateVariants_LegacyVid_DisableLeftAlignment_MultiAllelic_Deletions() + { + InfoData infoData = new InfoDataBuilder().Create(); + var variantFactory = new VariantFactory(_chr12Seq, _legacyVidCreator); + + IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr12, 106500158, 106500161, "GTTA", + new[] {"GTA", "GT"}, infoData, new[] {false, false}, false, null, null); + + Assert.Equal(2, variants.Length); + Assert.Equal("12:106500160:106500160", variants[0].VariantId); + Assert.Equal("12:106500160:106500161", variants[1].VariantId); + } + + // chr12 106500158 . GTTA GTA,GT . . . + [Fact] + public void CreateVariants_NormalVid_EnableLeftAlignment_MultiAllelic_Deletions() + { + InfoData infoData = new InfoDataBuilder().Create(); + var variantFactory = new VariantFactory(_chr12Seq, _vidCreator); + + IVariant[] variants = variantFactory.CreateVariants(ChromosomeUtilities.Chr12, 106500158, 106500161, "GTTA", + new[] {"GTA", "GT"}, infoData, new[] {false, false}, false, null, null); + + Assert.Equal(2, variants.Length); + Assert.Equal("12-106500158-GT-G", variants[0].VariantId); + Assert.Equal("12-106500159-TTA-T", variants[1].VariantId); + } + + [Fact] + public void ToPosition_SNV() + { + IPosition position = ParseVcfLine("chr1 15274 SNV A T . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-15274-A-T", variant.VariantId); + Assert.Equal(VariantType.SNV, variant.Type); + Assert.Equal(15274, variant.Start); + Assert.Equal(15274, variant.End); + } + + [Fact] + public void ToPosition_insertion() + { + IPosition position = ParseVcfLine("chr1 15903 INS G GC . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-15903-G-GC", variant.VariantId); + Assert.Equal(VariantType.insertion, variant.Type); + Assert.Equal(15904, variant.Start); + Assert.Equal(15903, variant.End); + } + + [Fact] + public void ToPosition_deletion() + { + IPosition position = ParseVcfLine("chr1 20094 DEL TAA T . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-20094-TAA-T", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(20095, variant.Start); + Assert.Equal(20096, variant.End); + } + + [Fact] + public void ToPosition_CANVAS_LOH() + { + IPosition position = ParseVcfLine("chr1 787923 CNV_CANVAS_LOH N 40 . SVTYPE=LOH;END=887923 RC:BC:CN:MCC 106.52:12642:2:2"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-787923-887923-A--LOH", variant.VariantId); + Assert.Equal(VariantType.copy_number_variation, variant.Type); + Assert.Equal(787924, variant.Start); + Assert.Equal(887923, variant.End); + } + + [Fact] + public void ToPosition_Manta_SmallDeletion() + { + IPosition position = ParseVcfLine( + "chr1 934064 SV_SNV AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG A . . END=934904;SVTYPE=DEL . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal( + "1-934064-AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG-A", + variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(934065, variant.Start); + Assert.Equal(934904, variant.End); + } + + [Fact] + public void ToPosition_CANVAS_CNnum() + { + IPosition position = ParseVcfLine( + "chr1 1037630 CNV_CN# N . . SVTYPE=CNV;END=1045024 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:60.76:8:1:.:.:22.51:PASS:."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-1037630-1045024-G--CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_variation, variant.Type); + Assert.Equal(1037631, variant.Start); + Assert.Equal(1045024, variant.End); + } + + [Fact] + public void ToPosition_SV_DUP() + { + IPosition position = ParseVcfLine("chr1 1477854 SV_DUP C . . END=1477984;SVTYPE=DUP . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-1477854-1477984-C--DUP", variant.VariantId); + Assert.Equal(VariantType.tandem_duplication, variant.Type); + Assert.Equal(1477855, variant.Start); + Assert.Equal(1477984, variant.End); + } + + [Fact] + public void ToPosition_SV_INS() + { + IPosition position = ParseVcfLine("chr1 1565683 SV_INS G . . END=1565684;SVTYPE=INS . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-1565683-1565684-G--INS", variant.VariantId); + Assert.Equal(VariantType.insertion, variant.Type); + Assert.Equal(1565684, variant.Start); + Assert.Equal(1565684, variant.End); + } + + [Fact] + public void ToPosition_SV_INV() + { + IPosition position = ParseVcfLine("chr1 6558910 SV_INV G . . END=6559723;SVTYPE=INV . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-6558910-6559723-G--INV", variant.VariantId); + Assert.Equal(VariantType.inversion, variant.Type); + Assert.Equal(6558911, variant.Start); + Assert.Equal(6559723, variant.End); + } + + [Fact] + public void ToPosition_SV_Translocation() + { + IPosition position = ParseVcfLine("chr1 9061384 SV_BND C C]chr14:93246833] . . SVTYPE=BND . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-9061384-C-C]chr14:93246833]", variant.VariantId); + Assert.Equal(VariantType.translocation_breakend, variant.Type); + Assert.Equal(9061384, variant.Start); + Assert.Equal(9061384, variant.End); + } + + [Fact] + public void ToPosition_DRAGEN_LOH() + { + IPosition position = ParseVcfLine( + "chr1 11071439 CNV_DRAGEN_LOH N , . . SVTYPE=CNV;END=12859473;REFLEN=1788034 GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS 1/2:2:0:1000:1000:2.03102:0.000203:248.8:0.0001:1493:1137"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1-11071439-12859473-G--CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_loss, variant.Type); + Assert.Equal(11071440, variant.Start); + Assert.Equal(12859473, variant.End); + + variant = variants[1]; + Assert.Equal("1-11071439-12859473-G--CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_gain, variant.Type); + Assert.Equal(11071440, variant.Start); + Assert.Equal(12859473, variant.End); + } + + [Fact] + public void ToPosition_STR() + { + IPosition position = ParseVcfLine( + "chr3 63912684 STR G . PASS END=63912714;REF=10;RL=30;RU=GCA;VARID=ATXN7;REPID=ATXN7 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC 0/1:SPANNING/SPANNING:10/12:10-10/12-12:9/3:8/11:0/0:26.270270"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("3-63912684-63912714-G--STR", variant.VariantId); + Assert.Equal(VariantType.short_tandem_repeat_variation, variant.Type); + Assert.Equal(63912685, variant.Start); + Assert.Equal(63912714, variant.End); + } + + [Fact] + public void ToPosition_indel() + { + IPosition position = ParseVcfLine("chr4 46758265 INDEL GAGGTATAGAG GTT . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("4-46758266-AGGTATAGAG-TT", variant.VariantId); + Assert.Equal(VariantType.indel, variant.Type); + Assert.Equal(46758266, variant.Start); + Assert.Equal(46758275, variant.End); + } + + [Fact] + public void ToPosition_MNV() + { + IPosition position = ParseVcfLine("chr4 67754304 MNV TGA TTT . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("4-67754305-GA-TT", variant.VariantId); + Assert.Equal(VariantType.MNV, variant.Type); + Assert.Equal(67754305, variant.Start); + Assert.Equal(67754306, variant.End); + } + + [Fact] + public void ToPosition_CNV_DUP() + { + IPosition position = ParseVcfLine( + "chr7 100955984 CNV_DUP N 37 PASS SVTYPE=CNV;END=100969873;REFLEN=13889 GT:SM:CN:BC:PE ./1:1.6625:3:12:48,81"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("7-100955984-100969873-C--CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_gain, variant.Type); + Assert.Equal(100955985, variant.Start); + Assert.Equal(100969873, variant.End); + } + + [Fact] + public void ToPosition_CNV_DEL() + { + IPosition position = ParseVcfLine( + "chr7 110541589 CNV_DEL N 27 cnvLength SVTYPE=CNV;END=110548681;REFLEN=7092 GT:SM:CN:BC:PE 0/1:0.443182:1:7:19,17"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("7-110541589-110548681-T--CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_loss, variant.Type); + Assert.Equal(110541590, variant.Start); + Assert.Equal(110548681, variant.End); + } + + [Fact] + public void ToPosition_ROH() + { + IPosition position = ParseVcfLine("chr22 36690136 ROH N . . END=36788158;SVTYPE=ROH . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("22-36690136-36788158-C--ROH", variant.VariantId); + Assert.Equal(VariantType.run_of_homozygosity, variant.Type); + Assert.Equal(36690137, variant.Start); + Assert.Equal(36788158, variant.End); + } + + // this is actually on GRCh37 + [Fact] + public void ToPosition_MultiAllelic_Deletions() + { + IPosition position = ParseVcfLine("chr12 106500158 . GTTA GTA,GT . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("12-106500158-GT-G", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(106500159, variant.Start); + Assert.Equal(106500159, variant.End); + + variant = variants[1]; + Assert.Equal("12-106500159-TTA-T", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(106500160, variant.Start); + Assert.Equal(106500161, variant.End); + } } } \ No newline at end of file diff --git a/UnitTests/Vcf/VariantCreator/VariantFactoryTestsWithLegacyVids.cs b/UnitTests/Vcf/VariantCreator/VariantFactoryTestsWithLegacyVids.cs new file mode 100644 index 00000000..dc9450b0 --- /dev/null +++ b/UnitTests/Vcf/VariantCreator/VariantFactoryTestsWithLegacyVids.cs @@ -0,0 +1,320 @@ +using System.IO; +using Genome; +using Moq; +using OptimizedCore; +using UnitTests.TestDataStructures; +using UnitTests.TestUtilities; +using VariantAnnotation.Interface.IO; +using VariantAnnotation.Interface.Positions; +using VariantAnnotation.Interface.Providers; +using Variants; +using Vcf; +using Vcf.VariantCreator; +using Xunit; + +namespace UnitTests.Vcf.VariantCreator +{ + public sealed class VariantFactoryTestsWithLegacyVids + { + private readonly Mock _sequenceMock = new(); + private readonly ISequenceProvider _sequenceProvider; + private readonly VariantFactory _variantFactory; + + public VariantFactoryTestsWithLegacyVids() + { + _sequenceProvider = new SimpleSequenceProvider(GenomeAssembly.GRCh38, _sequenceMock.Object, ChromosomeUtilities.RefNameToChromosome); + var vidCreator = new LegacyVariantId(ChromosomeUtilities.RefNameToChromosome); + _variantFactory = new VariantFactory(_sequenceMock.Object, vidCreator); + } + + private IPosition ParseVcfLine(string vcfLine) + { + string[] vcfFields = vcfLine.OptimizedSplit('\t'); + IChromosome chromosome = + ReferenceNameUtilities.GetChromosome(ChromosomeUtilities.RefNameToChromosome, vcfFields[VcfCommon.ChromIndex]); + + (int start, bool foundError) = vcfFields[VcfCommon.PosIndex].OptimizedParseInt32(); + if (foundError) throw new InvalidDataException($"Unable to convert the VCF position to an integer: {vcfFields[VcfCommon.PosIndex]}"); + + var simplePosition = SimplePosition.GetSimplePosition(chromosome, start, vcfFields, new NullVcfFilter()); + + return Position.ToPosition(simplePosition, null, _sequenceProvider, null, _variantFactory); + } + + [Fact] + public void ToPosition_SNV() + { + IPosition position = ParseVcfLine("chr1 15274 SNV A T . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:15274:T", variant.VariantId); + Assert.Equal(VariantType.SNV, variant.Type); + Assert.Equal(15274, variant.Start); + Assert.Equal(15274, variant.End); + } + + [Fact] + public void ToPosition_insertion() + { + IPosition position = ParseVcfLine("chr1 15903 INS G GC . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:15904:15903:C", variant.VariantId); + Assert.Equal(VariantType.insertion, variant.Type); + Assert.Equal(15904, variant.Start); + Assert.Equal(15903, variant.End); + } + + [Fact] + public void ToPosition_deletion() + { + IPosition position = ParseVcfLine("chr1 20094 DEL TAA T . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:20095:20096", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(20095, variant.Start); + Assert.Equal(20096, variant.End); + } + + [Fact] + public void ToPosition_CANVAS_LOH() + { + IPosition position = ParseVcfLine("chr1 787923 CNV_CANVAS_LOH N 40 . SVTYPE=LOH;END=887923 RC:BC:CN:MCC 106.52:12642:2:2"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:787924:887923:CNV", variant.VariantId); + Assert.Equal(VariantType.copy_number_variation, variant.Type); + Assert.Equal(787924, variant.Start); + Assert.Equal(887923, variant.End); + } + + [Fact] + public void ToPosition_Manta_SmallDeletion() + { + IPosition position = ParseVcfLine( + "chr1 934064 SV_SNV AGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTCCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGAGGCTGCTCCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGGGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCAGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTCCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCGGGGGAGGCGGCTGCGTTACAGGTGGGCG A . . END=934904;SVTYPE=DEL . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal( + "1:934065:934904", + variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(934065, variant.Start); + Assert.Equal(934904, variant.End); + } + + [Fact] + public void ToPosition_CANVAS_CNnum() + { + IPosition position = + ParseVcfLine("chr1 1037630 CNV_CN# N . . SVTYPE=CNV;END=1045024 GT:RC:BC:CN:MCC:MCCQ:QS:FT:DQ 0/1:60.76:8:1:.:.:22.51:PASS:."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:1037631:1045024:CN0", variant.VariantId); + Assert.Equal(VariantType.copy_number_variation, variant.Type); + Assert.Equal(1037631, variant.Start); + Assert.Equal(1045024, variant.End); + } + + [Fact] + public void ToPosition_SV_DUP() + { + IPosition position = ParseVcfLine("chr1 1477854 SV_DUP C . . END=1477984;SVTYPE=DUP . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:1477855:1477984:TDUP", variant.VariantId); + Assert.Equal(VariantType.tandem_duplication, variant.Type); + Assert.Equal(1477855, variant.Start); + Assert.Equal(1477984, variant.End); + } + + [Fact] + public void ToPosition_SV_INS() + { + IPosition position = ParseVcfLine("chr1 1565683 SV_INS G . . END=1565684;SVTYPE=INS . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:1565684:1565684:INS", variant.VariantId); + Assert.Equal(VariantType.insertion, variant.Type); + Assert.Equal(1565684, variant.Start); + Assert.Equal(1565684, variant.End); + } + + [Fact] + public void ToPosition_SV_INV() + { + IPosition position = ParseVcfLine("chr1 6558910 SV_INV G . . END=6559723;SVTYPE=INV . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:6558911:6559723:Inverse", variant.VariantId); + Assert.Equal(VariantType.inversion, variant.Type); + Assert.Equal(6558911, variant.Start); + Assert.Equal(6559723, variant.End); + } + + [Fact] + public void ToPosition_SV_Translocation() + { + IPosition position = ParseVcfLine("chr1 9061384 SV_BND C C]chr14:93246833] . . SVTYPE=BND . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:9061384:+:14:93246833:-", variant.VariantId); + Assert.Equal(VariantType.translocation_breakend, variant.Type); + Assert.Equal(9061384, variant.Start); + Assert.Equal(9061384, variant.End); + } + + [Fact] + public void ToPosition_DRAGEN_LOH() + { + IPosition position = + ParseVcfLine( + "chr1 11071439 CNV_DRAGEN_LOH N , . . SVTYPE=CNV;END=12859473;REFLEN=1788034 GT:CN:MCN:CNQ:MCNQ:CNF:MCNF:SD:MAF:BC:AS 1/2:2:0:1000:1000:2.03102:0.000203:248.8:0.0001:1493:1137"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("1:11071440:12859473:CDEL", variant.VariantId); + Assert.Equal(VariantType.copy_number_loss, variant.Type); + Assert.Equal(11071440, variant.Start); + Assert.Equal(12859473, variant.End); + + variant = variants[1]; + Assert.Equal("1:11071440:12859473:CDUP", variant.VariantId); + Assert.Equal(VariantType.copy_number_gain, variant.Type); + Assert.Equal(11071440, variant.Start); + Assert.Equal(12859473, variant.End); + } + + [Fact] + public void ToPosition_STR() + { + IPosition position = + ParseVcfLine( + "chr3 63912684 STR G . PASS END=63912714;REF=10;RL=30;RU=GCA;VARID=ATXN7;REPID=ATXN7 GT:SO:REPCN:REPCI:ADSP:ADFL:ADIR:LC 0/1:SPANNING/SPANNING:10/12:10-10/12-12:9/3:8/11:0/0:26.270270"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("3:63912685:63912714:GCA:12", variant.VariantId); + Assert.Equal(VariantType.short_tandem_repeat_variation, variant.Type); + Assert.Equal(63912685, variant.Start); + Assert.Equal(63912714, variant.End); + } + + [Fact] + public void ToPosition_indel() + { + IPosition position = ParseVcfLine("chr4 46758265 INDEL GAGGTATAGAG GTT . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("4:46758266:46758275:TT", variant.VariantId); + Assert.Equal(VariantType.indel, variant.Type); + Assert.Equal(46758266, variant.Start); + Assert.Equal(46758275, variant.End); + } + + [Fact] + public void ToPosition_MNV() + { + IPosition position = ParseVcfLine("chr4 67754304 MNV TGA TTT . . . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("4:67754305:67754306:TT", variant.VariantId); + Assert.Equal(VariantType.MNV, variant.Type); + Assert.Equal(67754305, variant.Start); + Assert.Equal(67754306, variant.End); + } + + [Fact] + public void ToPosition_CNV_DUP() + { + IPosition position = + ParseVcfLine("chr7 100955984 CNV_DUP N 37 PASS SVTYPE=CNV;END=100969873;REFLEN=13889 GT:SM:CN:BC:PE ./1:1.6625:3:12:48,81"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("7:100955985:100969873:CDUP", variant.VariantId); + Assert.Equal(VariantType.copy_number_gain, variant.Type); + Assert.Equal(100955985, variant.Start); + Assert.Equal(100969873, variant.End); + } + + [Fact] + public void ToPosition_CNV_DEL() + { + IPosition position = + ParseVcfLine( + "chr7 110541589 CNV_DEL N 27 cnvLength SVTYPE=CNV;END=110548681;REFLEN=7092 GT:SM:CN:BC:PE 0/1:0.443182:1:7:19,17"); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("7:110541590:110548681:CDEL", variant.VariantId); + Assert.Equal(VariantType.copy_number_loss, variant.Type); + Assert.Equal(110541590, variant.Start); + Assert.Equal(110548681, variant.End); + } + + [Fact] + public void ToPosition_ROH() + { + IPosition position = ParseVcfLine("chr22 36690136 ROH N . . END=36788158;SVTYPE=ROH . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("22:36690137:36788158:ROH", variant.VariantId); + Assert.Equal(VariantType.run_of_homozygosity, variant.Type); + Assert.Equal(36690137, variant.Start); + Assert.Equal(36788158, variant.End); + } + + // this is actually on GRCh37 + [Fact] + public void ToPosition_MultiAllelic_Deletions() + { + IPosition position = ParseVcfLine("chr12 106500158 . GTTA GTA,GT . . ."); + IVariant[] variants = position.Variants; + Assert.NotNull(variants); + + IVariant variant = variants[0]; + Assert.Equal("12:106500160:106500160", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(106500160, variant.Start); + Assert.Equal(106500160, variant.End); + + variant = variants[1]; + Assert.Equal("12:106500160:106500161", variant.VariantId); + Assert.Equal(VariantType.deletion, variant.Type); + Assert.Equal(106500160, variant.Start); + Assert.Equal(106500161, variant.End); + } + } +} \ No newline at end of file diff --git a/VariantAnnotation.Interface/IVariantIdCreator.cs b/VariantAnnotation.Interface/IVariantIdCreator.cs index 1cdfdb14..ba6a39e5 100644 --- a/VariantAnnotation.Interface/IVariantIdCreator.cs +++ b/VariantAnnotation.Interface/IVariantIdCreator.cs @@ -6,5 +6,7 @@ public interface IVariantIdCreator { string Create(ISequence sequence, VariantCategory category, string svType, IChromosome chromosome, int start, int end, string refAllele, string altAllele, string repeatUnit); + + (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, string refAllele, string altAllele); } } \ No newline at end of file diff --git a/Vcf/VariantCreator/LegacyVariantId.cs b/Vcf/VariantCreator/LegacyVariantId.cs index ed5022bf..08ad52f4 100644 --- a/Vcf/VariantCreator/LegacyVariantId.cs +++ b/Vcf/VariantCreator/LegacyVariantId.cs @@ -40,6 +40,12 @@ public string Create(ISequence sequence, VariantCategory category, string svType } } + public (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, string refAllele, string altAllele) + { + if (altAllele.Contains('[') || altAllele.Contains(']')) return (start, refAllele, altAllele); + return BiDirectionalTrimmer.Trim(start, refAllele, altAllele); + } + private static string GetSvVid(IDictionary refNameToChromosome, string svType, IChromosome chromosome, int start, int end, string refAllele, string altAllele) { var variantType = StructuralVariantCreator.GetVariantType(altAllele, svType); @@ -74,21 +80,20 @@ private static string GetSvVid(IDictionary refNameToChromos return $"{chromosome.EnsemblName}:{start + 1}:{end}"; } } - - private static (IChromosome Chromosome2, int Position2, bool IsSuffix1, bool IsSuffix2) ParseBreakendAltAllele(IDictionary refNameToChromosome, string refAllele, string altAllele) + + private static (IChromosome Chromosome2, int Position2, bool IsSuffix1, bool IsSuffix2) ParseBreakendAltAllele( + IDictionary refNameToChromosome, string refAllele, string altAllele) { string referenceName2; - int position2; - bool isSuffix2; - + int position2; + bool isSuffix2; + const string forwardBreakEnd = "["; - // (\w+)([\[\]])([^:]+):(\d+)([\[\]]) - // ([\[\]])([^:]+):(\d+)([\[\]])(\w+) if (altAllele.StartsWith(refAllele)) { - var forwardRegex = new Regex(@"\w+([\[\]])([^:]+):(\d+)([\[\]])", RegexOptions.Compiled); - var match = forwardRegex.Match(altAllele); + var forwardRegex = new Regex(@"\w+([\[\]])(.+):(\d+)([\[\]])", RegexOptions.Compiled); + Match match = forwardRegex.Match(altAllele); if (!match.Success) throw new InvalidDataException( @@ -102,8 +107,8 @@ private static (IChromosome Chromosome2, int Position2, bool IsSuffix1, bool IsS } else { - var reverseRegex = new Regex(@"([\[\]])([^:]+):(\d+)([\[\]])\w+", RegexOptions.Compiled); - var match = reverseRegex.Match(altAllele); + var reverseRegex = new Regex(@"([\[\]])(.+):(\d+)([\[\]])\w+", RegexOptions.Compiled); + Match match = reverseRegex.Match(altAllele); if (!match.Success) throw new InvalidDataException( diff --git a/Vcf/VariantCreator/VariantFactory.cs b/Vcf/VariantCreator/VariantFactory.cs index 585e89a7..6cceac45 100644 --- a/Vcf/VariantCreator/VariantFactory.cs +++ b/Vcf/VariantCreator/VariantFactory.cs @@ -46,7 +46,7 @@ public IVariant[] CreateVariants(IChromosome chromosome, int start, int end, str if (isDecomposed && isRecomposed) throw new InvalidDataException("A variant can't be both decomposed and recomposed"); (int shiftedStart, string shiftedRef, string shiftedAlt) = - VariantUtils.TrimAndLeftAlign(start, refAllele, altAllele, _sequence); + _vidCreator.Normalize(_sequence, start, refAllele, altAllele); if (variantCategory == VariantCategory.SmallVariant || variantCategory == VariantCategory.Reference) end = shiftedStart + shiftedRef.Length - 1; @@ -58,7 +58,7 @@ public IVariant[] CreateVariants(IChromosome chromosome, int start, int end, str return variants.Count == 0 ? null : variants.ToArray(); } - private static VariantCategory GetVariantCategory(string firstAltAllele, string svType) + internal static VariantCategory GetVariantCategory(string firstAltAllele, string svType) { bool isSymbolicAllele = IsSymbolicAllele(firstAltAllele); @@ -66,7 +66,7 @@ private static VariantCategory GetVariantCategory(string firstAltAllele, string if (!isSymbolicAllele) return VariantCategory.SmallVariant; if (firstAltAllele == "") return VariantCategory.ROH; if (firstAltAllele.StartsWith(" altAllele.Contains("[") || altAllele.Contains("]"); diff --git a/Vcf/VariantCreator/VariantId.cs b/Vcf/VariantCreator/VariantId.cs index fcd6e2b2..27aea724 100644 --- a/Vcf/VariantCreator/VariantId.cs +++ b/Vcf/VariantCreator/VariantId.cs @@ -1,5 +1,6 @@ using Genome; using VariantAnnotation.Interface; +using Variants; namespace Vcf.VariantCreator { @@ -36,6 +37,9 @@ public string Create(ISequence sequence, VariantCategory category, string svType return GetLongVid(chromosome.EnsemblName, start, end, refAllele, altAllele, svType); } + public (int Start, string RefAllele, string AltAllele) Normalize(ISequence sequence, int start, + string refAllele, string altAllele) => VariantUtils.TrimAndLeftAlign(start, refAllele, altAllele, sequence); + private static string GetVid(string chromosomeName, int paddedPosition, string paddedRefAllele, string paddedAltAllele) => chromosomeName + '-' + paddedPosition + '-' + paddedRefAllele + '-' + paddedAltAllele;