diff --git a/src/main/java/picard/util/LiftoverUtils.java b/src/main/java/picard/util/LiftoverUtils.java index 255a59477d..3a85de8212 100644 --- a/src/main/java/picard/util/LiftoverUtils.java +++ b/src/main/java/picard/util/LiftoverUtils.java @@ -63,9 +63,10 @@ public class LiftoverUtils { * @param target The target interval * @param refSeq The reference sequence, which should match the target interval * @param writeOriginalPosition If true, INFO field annotations will be added to store the original position and contig + * @param writeOriginalAlleles If true, an INFO field annotation will be added to store the original alleles in order. This can be useful in the case of more complex liftovers, like reverse complements, left aligned indels or swapped REF/ALT * @return The lifted VariantContext. This will be null if the input VariantContext could not be lifted. */ - public static VariantContext liftVariant(final VariantContext source, final Interval target, final ReferenceSequence refSeq, final boolean writeOriginalPosition){ + public static VariantContext liftVariant(final VariantContext source, final Interval target, final ReferenceSequence refSeq, final boolean writeOriginalPosition, final boolean writeOriginalAlleles){ if (target == null) { return null; } @@ -100,9 +101,26 @@ public static VariantContext liftVariant(final VariantContext source, final Inte builder.attribute(LiftoverVcf.ORIGINAL_START, source.getStart()); } + if (writeOriginalAlleles && !source.getAlleles().equals(builder.getAlleles())) { + builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, allelesToStringList(source.getAlleles())); + } + return builder.make(); } + /** + * This is a utility method that will convert a list of alleles into a list of base strings. Reference status + * is ignored when creating these strings (i.e. 'A', not 'A*'). These strings should be sufficient + * to recreate an Allele using Allele.create() + * @param alleles The list of alleles + * @return A list of strings representing the bases of the input alleles. + */ + protected static List allelesToStringList(final List alleles) { + final List ret = new ArrayList<>(); + alleles.forEach(a -> ret.add(a.isNoCall() ? Allele.NO_CALL_STRING : a.getDisplayString())); + return ret; + } + protected static VariantContextBuilder liftSimpleVariantContext(VariantContext source, Interval target){ if (target == null || source.getReference().length() != target.length()) { return null; diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index a5dfbe760e..9cb5b3b748 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -155,12 +155,22 @@ public class LiftoverVcf extends CommandLineProgram { @Argument(doc = "Write the original contig/position for lifted variants to the INFO field.", optional = true) public boolean WRITE_ORIGINAL_POSITION = false; + // Option on whether or not to write the original alleles of the variant to the INFO field + @Argument(doc = "Write the original alleles for lifted variants to the INFO field. If the alleles are identical, this attribute will be omitted.", optional = true) + public boolean WRITE_ORIGINAL_ALLELES = false; + @Argument(doc = "The minimum percent match required for a variant to be lifted.", optional = true) public double LIFTOVER_MIN_MATCH = 1.0; @Argument(doc = "Allow INFO and FORMAT in the records that are not found in the header", optional = true) public boolean ALLOW_MISSING_FIELDS_IN_HEADER = false; + + @Argument(doc = "If the REF allele of the lifted site does not match the target genome, that variant is normally rejected. " + + "For bi-allelic SNPs, if this is set to true and the ALT allele equals the new REF allele, the REF and ALT alleles will be swapped. This can rescue " + + "some variants; however, do this carefully as some annotations may become invalid, such as any that are alelle-specifc. See also TAGS_TO_REVERSE and TAGS_TO_DROP.", optional = true) + public boolean RECOVER_SWAPPED_REF_ALT = false; + @Argument(doc = "INFO field annotations that behave like an Allele Frequency and should be transformed with x->1-x " + "when swapping reference with variant alleles.", optional = true) public Collection TAGS_TO_REVERSE = new ArrayList<>(LiftoverUtils.DEFAULT_TAGS_TO_REVERSE); @@ -212,6 +222,11 @@ public class LiftoverVcf extends CommandLineProgram { */ public static final String ORIGINAL_START = "OriginalStart"; + /** + * Attribute used to store the list of original alleles (including REF), in the original order prior to liftover. + */ + public static final String ORIGINAL_ALLELES = "OriginalAlleles"; + /** * Attribute used to store the position of the failed variant on the target contig prior to finding out that alleles do not match. */ @@ -222,14 +237,15 @@ public class LiftoverVcf extends CommandLineProgram { */ private static final List ATTRS = CollectionUtil.makeList( new VCFInfoHeaderLine(ORIGINAL_CONTIG, 1, VCFHeaderLineType.String, "The name of the source contig/chromosome prior to liftover."), - new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover.") + new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover."), + new VCFInfoHeaderLine(ORIGINAL_ALLELES, VCFHeaderLineCount.R, VCFHeaderLineType.String, "A list of the original alleles (including REF) of the variant prior to liftover. If the alleles were not changed during liftover, this attribute will be omitted.") ); private VariantContextWriter rejects; private final Log log = Log.getInstance(LiftoverVcf.class); private SortingCollection sorter; - private long failedLiftover = 0, failedAlleleCheck = 0; + private long failedLiftover = 0, failedAlleleCheck = 0, totalTrackedAsSwapRefAlt = 0; private Map rejectsByContig = new TreeMap<>(); private Map liftedByDestContig = new TreeMap<>(); private Map liftedBySourceContig = new TreeMap<>(); @@ -365,7 +381,7 @@ protected int doWork() { } else { refSeq = refSeqs.get(target.getContig()); - final VariantContext liftedVC = LiftoverUtils.liftVariant(ctx, target, refSeq, WRITE_ORIGINAL_POSITION); + final VariantContext liftedVC = LiftoverUtils.liftVariant(ctx, target, refSeq, WRITE_ORIGINAL_POSITION, WRITE_ORIGINAL_ALLELES); // the liftedVC can be null if the liftover fails because of a problem with reverse complementing if (liftedVC == null) { rejectVariant(ctx, FILTER_CANNOT_LIFTOVER_INDEL); @@ -404,6 +420,12 @@ protected int doWork() { log.info("no successfully lifted variants"); } + if (RECOVER_SWAPPED_REF_ALT) { + log.info(totalTrackedAsSwapRefAlt, " variants were lifted by swapping REF/ALT alleles."); + } else { + log.warn(totalTrackedAsSwapRefAlt, " variants with a swapped REF/ALT were identified, but were not recovered. See RECOVER_SWAPPED_REF_ALT and associated caveats."); + } + rejects.close(); in.close(); @@ -440,6 +462,12 @@ private void trackLiftedVariantContig(Map map, String contig) { map.put(contig, ++val); } + private void addAndTrack(final VariantContext toAdd, final VariantContext source) { + trackLiftedVariantContig(liftedBySourceContig, source.getContig()); + trackLiftedVariantContig(liftedByDestContig, toAdd.getContig()); + sorter.add(toAdd); + } + /** * utility function to attempt to add a variant. Checks that the reference allele still matches the reference (which may have changed) * @@ -462,10 +490,14 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re if (!refString.equalsIgnoreCase(allele.getBaseString())) { // consider that the ref and the alt may have been swapped in a simple biallelic SNP - if (vc.isBiallelic() && vc.isSNP() && - refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) { - sorter.add(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP)); - return; + if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) { + if (RECOVER_SWAPPED_REF_ALT) { + totalTrackedAsSwapRefAlt++; + addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source); + return; + } else { + totalTrackedAsSwapRefAlt++; + } } mismatchesReference = true; } @@ -481,9 +513,7 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re failedAlleleCheck++; trackLiftedVariantContig(rejectsByContig, source.getContig()); } else { - trackLiftedVariantContig(liftedBySourceContig, source.getContig()); - trackLiftedVariantContig(liftedByDestContig, vc.getContig()); - sorter.add(vc); + addAndTrack(vc, source); } } } \ No newline at end of file diff --git a/src/test/java/picard/util/LiftoverUtilsTest.java b/src/test/java/picard/util/LiftoverUtilsTest.java index 1783970a6d..b2a5247c1a 100644 --- a/src/test/java/picard/util/LiftoverUtilsTest.java +++ b/src/test/java/picard/util/LiftoverUtilsTest.java @@ -3,15 +3,13 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.variantcontext.VariantContextComparator; +import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import picard.vcf.VcfTestUtils; -import java.lang.reflect.Array; import java.util.*; - -import static org.testng.Assert.*; +import java.util.stream.Collectors; /** * Created by farjoun on 1/16/18. @@ -88,4 +86,35 @@ public Object[][] swapRefAltData() { public void testSwapRefAlt(final VariantContext swapMe, final VariantContext expected) { VcfTestUtils.assertEquals(LiftoverUtils.swapRefAlt(swapMe, annotationsToSwap, annotationsToDrop), expected); } + + @DataProvider + public Iterator allelesToStringData() { + + final List tests = new ArrayList<>(); + tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create("G")), Arrays.asList("A", "G")}); + tests.add(new Object[]{Arrays.asList(Allele.create("A", false), Allele.create("G")), Arrays.asList("A", "G")}); + tests.add(new Object[]{Arrays.asList(Allele.create("T", true), Allele.create("T")), Arrays.asList("T", "T")}); + tests.add(new Object[]{Arrays.asList(Allele.create("*", false), Allele.create("G")), Arrays.asList("*", "G")}); + tests.add(new Object[]{Arrays.asList(Allele.create("ATG", true), Allele.create("G")), Arrays.asList("ATG", "G")}); + tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create("GGGTGT")), Arrays.asList("A", "GGGTGT")}); + tests.add(new Object[]{Arrays.asList(Allele.create("A", false), Allele.create("A")), Arrays.asList("A", "A")}); + + tests.add(new Object[]{Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Arrays.asList(Allele.NO_CALL_STRING, Allele.NO_CALL_STRING)}); + tests.add(new Object[]{Arrays.asList(Allele.SPAN_DEL, Allele.create("A", true)), Arrays.asList(Allele.SPAN_DEL_STRING, "A")}); + tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create(".")), Arrays.asList("A", Allele.NO_CALL_STRING)}); + tests.add(new Object[]{Arrays.asList(Allele.create("TT", true), Allele.create(".")), Arrays.asList("TT", Allele.NO_CALL_STRING)}); + + return tests.iterator(); + } + + @Test(dataProvider = "allelesToStringData") + public void testAllelesToString(final List input, final List output) { + Assert.assertEquals(LiftoverUtils.allelesToStringList(input), output); + + //these should back-convert into the same alleles. + List restoredAlleles = output.stream().map(Allele::create).collect(Collectors.toList()); + for (int i = 0;i indelFlipDataWithOriginalAllele() { + final List tests = new ArrayList<>(); + indelFlipData().forEachRemaining(x -> { + if (x[2] == null) { + tests.add(x); + } else { + VariantContext source = (VariantContext)x[0]; + VariantContextBuilder result_builder = new VariantContextBuilder((VariantContext)x[2]); + result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(source.getAlleles())); + tests.add(new Object[]{x[0], x[1], result_builder.make()}); + } + }); + + return tests.iterator(); + } + + @Test(dataProvider = "indelFlipDataWithOriginalAllele") + public void testFlipIndelWithOriginalAlleles(final VariantContext source, final ReferenceSequence reference, final VariantContext result) { + + final LiftOver liftOver = new LiftOver(CHAIN_FILE); + final Interval originalLocus = new Interval(source.getContig(), source.getStart(), source.getEnd()); + final Interval target = liftOver.liftOver(originalLocus); + if (target != null && !target.isNegativeStrand()) { + throw new RuntimeException("not reversed"); + } + + final VariantContext flipped = LiftoverUtils.liftVariant(source, target, reference, false, true); VcfTestUtils.assertEquals(flipped, result); } @@ -554,7 +590,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r public Iterator snpWithChangedRef() { final ReferenceSequence reference = new ReferenceSequence("chr1", 0, - "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes()); + refString.getBytes()); // 123456789 123456789 123456789 123 final Allele RefT = Allele.create("T", true); @@ -1086,6 +1122,7 @@ public Iterator noCallAndSymbolicData() { builder.start(start).stop(start).alleles(CollectionUtil.makeList(CRef, T, DEL)); result_builder.start(liftedStart).stop(liftedStart).alleles(CollectionUtil.makeList(GRef, A, DEL)); result_builder.attribute(LiftoverVcf.ORIGINAL_START, start); + result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(builder.getAlleles())); result_builder.attribute(LiftoverUtils.REV_COMPED_ALLELES, true); genotypeBuilder.alleles(CollectionUtil.makeList(T, DEL)); @@ -1102,7 +1139,7 @@ public Iterator noCallAndSymbolicData() { builder.start(start).stop(start).alleles(CollectionUtil.makeList(CRef, T)); result_builder.start(liftedStart).stop(liftedStart).alleles(CollectionUtil.makeList(GRef, A)); result_builder.attribute(LiftoverVcf.ORIGINAL_START, start); - + result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(builder.getAlleles())); genotypeBuilder.alleles(CollectionUtil.makeList(T, Allele.NO_CALL)); resultGenotypeBuilder.alleles(CollectionUtil.makeList(A, Allele.NO_CALL)); builder.genotypes(genotypeBuilder.make()); @@ -1120,10 +1157,16 @@ public void testLiftOverNoCallAndSymbolic(final LiftOver liftOver, final Variant Assert.assertEquals(target.isNegativeStrand(), expectReversed); - VariantContext vc = LiftoverUtils.liftVariant(source, target, REFERENCE, true); + VariantContext vc = LiftoverUtils.liftVariant(source, target, REFERENCE, true, true); VcfTestUtils.assertEquals(vc, result); Assert.assertEquals(vc.getAttribute(LiftoverVcf.ORIGINAL_CONTIG), source.getContig()); Assert.assertEquals(vc.getAttribute(LiftoverVcf.ORIGINAL_START), source.getStart()); + Assert.assertTrue(source.getAlleles().equals(result.getAlleles()) != result.hasAttribute(LiftoverVcf.ORIGINAL_ALLELES)); + if (!source.getAlleles().equals(result.getAlleles())) { + List resultAlleles = new ArrayList<>(); + source.getAlleles().forEach(a -> resultAlleles.add(a.getDisplayString())); + Assert.assertEquals(resultAlleles, result.getAttributeAsStringList(LiftoverVcf.ORIGINAL_ALLELES, null)); + } } }