Skip to content

Commit

Permalink
Add arguments and reporting around recovering REF/ALT swapped alleles (
Browse files Browse the repository at this point in the history
…#1151)

* Add arguments and reporting around recovering REF/ALT swapped alleles (**Changes default behavior!!!**)
* Add LiftoverUtils.allelesToStringList testing
  • Loading branch information
bbimber authored and Yossi Farjoun committed Apr 13, 2018
1 parent 0b8bf5e commit 359e3fd
Show file tree
Hide file tree
Showing 4 changed files with 139 additions and 19 deletions.
20 changes: 19 additions & 1 deletion src/main/java/picard/util/LiftoverUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ public class LiftoverUtils {
* @param target The target interval
* @param refSeq The reference sequence, which should match the target interval
* @param writeOriginalPosition If true, INFO field annotations will be added to store the original position and contig
* @param writeOriginalAlleles If true, an INFO field annotation will be added to store the original alleles in order. This can be useful in the case of more complex liftovers, like reverse complements, left aligned indels or swapped REF/ALT
* @return The lifted VariantContext. This will be null if the input VariantContext could not be lifted.
*/
public static VariantContext liftVariant(final VariantContext source, final Interval target, final ReferenceSequence refSeq, final boolean writeOriginalPosition){
public static VariantContext liftVariant(final VariantContext source, final Interval target, final ReferenceSequence refSeq, final boolean writeOriginalPosition, final boolean writeOriginalAlleles){
if (target == null) {
return null;
}
Expand Down Expand Up @@ -100,9 +101,26 @@ public static VariantContext liftVariant(final VariantContext source, final Inte
builder.attribute(LiftoverVcf.ORIGINAL_START, source.getStart());
}

if (writeOriginalAlleles && !source.getAlleles().equals(builder.getAlleles())) {
builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, allelesToStringList(source.getAlleles()));
}

return builder.make();
}

/**
* This is a utility method that will convert a list of alleles into a list of base strings. Reference status
* is ignored when creating these strings (i.e. 'A', not 'A*'). These strings should be sufficient
* to recreate an Allele using Allele.create()
* @param alleles The list of alleles
* @return A list of strings representing the bases of the input alleles.
*/
protected static List<String> allelesToStringList(final List<Allele> alleles) {
final List<String> ret = new ArrayList<>();
alleles.forEach(a -> ret.add(a.isNoCall() ? Allele.NO_CALL_STRING : a.getDisplayString()));
return ret;
}

protected static VariantContextBuilder liftSimpleVariantContext(VariantContext source, Interval target){
if (target == null || source.getReference().length() != target.length()) {
return null;
Expand Down
50 changes: 40 additions & 10 deletions src/main/java/picard/vcf/LiftoverVcf.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,22 @@ public class LiftoverVcf extends CommandLineProgram {
@Argument(doc = "Write the original contig/position for lifted variants to the INFO field.", optional = true)
public boolean WRITE_ORIGINAL_POSITION = false;

// Option on whether or not to write the original alleles of the variant to the INFO field
@Argument(doc = "Write the original alleles for lifted variants to the INFO field. If the alleles are identical, this attribute will be omitted.", optional = true)
public boolean WRITE_ORIGINAL_ALLELES = false;

@Argument(doc = "The minimum percent match required for a variant to be lifted.", optional = true)
public double LIFTOVER_MIN_MATCH = 1.0;

@Argument(doc = "Allow INFO and FORMAT in the records that are not found in the header", optional = true)
public boolean ALLOW_MISSING_FIELDS_IN_HEADER = false;


@Argument(doc = "If the REF allele of the lifted site does not match the target genome, that variant is normally rejected. " +
"For bi-allelic SNPs, if this is set to true and the ALT allele equals the new REF allele, the REF and ALT alleles will be swapped. This can rescue " +
"some variants; however, do this carefully as some annotations may become invalid, such as any that are alelle-specifc. See also TAGS_TO_REVERSE and TAGS_TO_DROP.", optional = true)
public boolean RECOVER_SWAPPED_REF_ALT = false;

@Argument(doc = "INFO field annotations that behave like an Allele Frequency and should be transformed with x->1-x " +
"when swapping reference with variant alleles.", optional = true)
public Collection<String> TAGS_TO_REVERSE = new ArrayList<>(LiftoverUtils.DEFAULT_TAGS_TO_REVERSE);
Expand Down Expand Up @@ -212,6 +222,11 @@ public class LiftoverVcf extends CommandLineProgram {
*/
public static final String ORIGINAL_START = "OriginalStart";

/**
* Attribute used to store the list of original alleles (including REF), in the original order prior to liftover.
*/
public static final String ORIGINAL_ALLELES = "OriginalAlleles";

/**
* Attribute used to store the position of the failed variant on the target contig prior to finding out that alleles do not match.
*/
Expand All @@ -222,14 +237,15 @@ public class LiftoverVcf extends CommandLineProgram {
*/
private static final List<VCFInfoHeaderLine> ATTRS = CollectionUtil.makeList(
new VCFInfoHeaderLine(ORIGINAL_CONTIG, 1, VCFHeaderLineType.String, "The name of the source contig/chromosome prior to liftover."),
new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover.")
new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover."),
new VCFInfoHeaderLine(ORIGINAL_ALLELES, VCFHeaderLineCount.R, VCFHeaderLineType.String, "A list of the original alleles (including REF) of the variant prior to liftover. If the alleles were not changed during liftover, this attribute will be omitted.")
);

private VariantContextWriter rejects;
private final Log log = Log.getInstance(LiftoverVcf.class);
private SortingCollection<VariantContext> sorter;

private long failedLiftover = 0, failedAlleleCheck = 0;
private long failedLiftover = 0, failedAlleleCheck = 0, totalTrackedAsSwapRefAlt = 0;
private Map<String, Long> rejectsByContig = new TreeMap<>();
private Map<String, Long> liftedByDestContig = new TreeMap<>();
private Map<String, Long> liftedBySourceContig = new TreeMap<>();
Expand Down Expand Up @@ -365,7 +381,7 @@ protected int doWork() {
} else {
refSeq = refSeqs.get(target.getContig());

final VariantContext liftedVC = LiftoverUtils.liftVariant(ctx, target, refSeq, WRITE_ORIGINAL_POSITION);
final VariantContext liftedVC = LiftoverUtils.liftVariant(ctx, target, refSeq, WRITE_ORIGINAL_POSITION, WRITE_ORIGINAL_ALLELES);
// the liftedVC can be null if the liftover fails because of a problem with reverse complementing
if (liftedVC == null) {
rejectVariant(ctx, FILTER_CANNOT_LIFTOVER_INDEL);
Expand Down Expand Up @@ -404,6 +420,12 @@ protected int doWork() {
log.info("no successfully lifted variants");
}

if (RECOVER_SWAPPED_REF_ALT) {
log.info(totalTrackedAsSwapRefAlt, " variants were lifted by swapping REF/ALT alleles.");
} else {
log.warn(totalTrackedAsSwapRefAlt, " variants with a swapped REF/ALT were identified, but were not recovered. See RECOVER_SWAPPED_REF_ALT and associated caveats.");
}

rejects.close();
in.close();

Expand Down Expand Up @@ -440,6 +462,12 @@ private void trackLiftedVariantContig(Map<String, Long> map, String contig) {
map.put(contig, ++val);
}

private void addAndTrack(final VariantContext toAdd, final VariantContext source) {
trackLiftedVariantContig(liftedBySourceContig, source.getContig());
trackLiftedVariantContig(liftedByDestContig, toAdd.getContig());
sorter.add(toAdd);
}

/**
* utility function to attempt to add a variant. Checks that the reference allele still matches the reference (which may have changed)
*
Expand All @@ -462,10 +490,14 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re

if (!refString.equalsIgnoreCase(allele.getBaseString())) {
// consider that the ref and the alt may have been swapped in a simple biallelic SNP
if (vc.isBiallelic() && vc.isSNP() &&
refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
sorter.add(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP));
return;
if (vc.isBiallelic() && vc.isSNP() && refString.equalsIgnoreCase(vc.getAlternateAllele(0).getBaseString())) {
if (RECOVER_SWAPPED_REF_ALT) {
totalTrackedAsSwapRefAlt++;
addAndTrack(LiftoverUtils.swapRefAlt(vc, TAGS_TO_REVERSE, TAGS_TO_DROP), source);
return;
} else {
totalTrackedAsSwapRefAlt++;
}
}
mismatchesReference = true;
}
Expand All @@ -481,9 +513,7 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re
failedAlleleCheck++;
trackLiftedVariantContig(rejectsByContig, source.getContig());
} else {
trackLiftedVariantContig(liftedBySourceContig, source.getContig());
trackLiftedVariantContig(liftedByDestContig, vc.getContig());
sorter.add(vc);
addAndTrack(vc, source);
}
}
}
37 changes: 33 additions & 4 deletions src/test/java/picard/util/LiftoverUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.VariantContextComparator;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import picard.vcf.VcfTestUtils;

import java.lang.reflect.Array;
import java.util.*;

import static org.testng.Assert.*;
import java.util.stream.Collectors;

/**
* Created by farjoun on 1/16/18.
Expand Down Expand Up @@ -88,4 +86,35 @@ public Object[][] swapRefAltData() {
public void testSwapRefAlt(final VariantContext swapMe, final VariantContext expected) {
VcfTestUtils.assertEquals(LiftoverUtils.swapRefAlt(swapMe, annotationsToSwap, annotationsToDrop), expected);
}

@DataProvider
public Iterator<Object[]> allelesToStringData() {

final List<Object[]> tests = new ArrayList<>();
tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create("G")), Arrays.asList("A", "G")});
tests.add(new Object[]{Arrays.asList(Allele.create("A", false), Allele.create("G")), Arrays.asList("A", "G")});
tests.add(new Object[]{Arrays.asList(Allele.create("T", true), Allele.create("T")), Arrays.asList("T", "T")});
tests.add(new Object[]{Arrays.asList(Allele.create("*", false), Allele.create("G")), Arrays.asList("*", "G")});
tests.add(new Object[]{Arrays.asList(Allele.create("ATG", true), Allele.create("G")), Arrays.asList("ATG", "G")});
tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create("GGGTGT")), Arrays.asList("A", "GGGTGT")});
tests.add(new Object[]{Arrays.asList(Allele.create("A", false), Allele.create("A")), Arrays.asList("A", "A")});

tests.add(new Object[]{Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), Arrays.asList(Allele.NO_CALL_STRING, Allele.NO_CALL_STRING)});
tests.add(new Object[]{Arrays.asList(Allele.SPAN_DEL, Allele.create("A", true)), Arrays.asList(Allele.SPAN_DEL_STRING, "A")});
tests.add(new Object[]{Arrays.asList(Allele.create("A", true), Allele.create(".")), Arrays.asList("A", Allele.NO_CALL_STRING)});
tests.add(new Object[]{Arrays.asList(Allele.create("TT", true), Allele.create(".")), Arrays.asList("TT", Allele.NO_CALL_STRING)});

return tests.iterator();
}

@Test(dataProvider = "allelesToStringData")
public void testAllelesToString(final List<Allele> input, final List<String> output) {
Assert.assertEquals(LiftoverUtils.allelesToStringList(input), output);

//these should back-convert into the same alleles.
List<Allele> restoredAlleles = output.stream().map(Allele::create).collect(Collectors.toList());
for (int i = 0;i<input.size();i++){
Assert.assertTrue(restoredAlleles.get(i).basesMatch(input.get(i)));
}
}
}
51 changes: 47 additions & 4 deletions src/test/java/picard/util/LiftoverVcfTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ public void testReverseComplementedIndels(final String filename, final int expec
"REJECT=" + rejectOutputFile.getAbsolutePath(),
"CHAIN=" + CHAIN_FILE,
"REFERENCE_SEQUENCE=" + REFERENCE_FILE,
"RECOVER_SWAPPED_REF_ALT=true",
"CREATE_INDEX=false"
};
Assert.assertEquals(runPicardCommandLine(args), 0);
Expand Down Expand Up @@ -109,6 +110,7 @@ public void testChangingInfoFields() {
"REJECT=" + rejectOutputFile.getAbsolutePath(),
"CHAIN=" + CHAIN_FILE,
"REFERENCE_SEQUENCE=" + REFERENCE_FILE,
"RECOVER_SWAPPED_REF_ALT=true",
"CREATE_INDEX=false"
};
Assert.assertEquals(runPicardCommandLine(args), 0);
Expand Down Expand Up @@ -263,6 +265,7 @@ public void testWriteVcfWithFlippedAlleles() throws IOException {
"REJECT=" + rejectOutputFile.getAbsolutePath(),
"CHAIN=" + TWO_INTERVAL_CHAIN_FILE,
"REFERENCE_SEQUENCE=" + TWO_INTERVALS_REFERENCE_FILE,
"RECOVER_SWAPPED_REF_ALT=true",
"CREATE_INDEX=false"
};

Expand Down Expand Up @@ -300,6 +303,7 @@ public void testWriteVcfWithFlippedAllelesNegativeChain() throws IOException {
"REJECT=" + rejectOutputFile.getAbsolutePath(),
"CHAIN=" + CHAIN_FILE,
"REFERENCE_SEQUENCE=" + REFERENCE_FILE,
"RECOVER_SWAPPED_REF_ALT=true",
"CREATE_INDEX=false"
};

Expand Down Expand Up @@ -545,7 +549,39 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r
throw new RuntimeException("not reversed");
}

final VariantContext flipped = LiftoverUtils.liftVariant(source, target, reference, false);
final VariantContext flipped = LiftoverUtils.liftVariant(source, target, reference, false, false);

VcfTestUtils.assertEquals(flipped, result);
}

@DataProvider(name = "indelFlipDataWithOriginalAllele")
public Iterator<Object[]> indelFlipDataWithOriginalAllele() {
final List<Object[]> tests = new ArrayList<>();
indelFlipData().forEachRemaining(x -> {
if (x[2] == null) {
tests.add(x);
} else {
VariantContext source = (VariantContext)x[0];
VariantContextBuilder result_builder = new VariantContextBuilder((VariantContext)x[2]);
result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(source.getAlleles()));
tests.add(new Object[]{x[0], x[1], result_builder.make()});
}
});

return tests.iterator();
}

@Test(dataProvider = "indelFlipDataWithOriginalAllele")
public void testFlipIndelWithOriginalAlleles(final VariantContext source, final ReferenceSequence reference, final VariantContext result) {

final LiftOver liftOver = new LiftOver(CHAIN_FILE);
final Interval originalLocus = new Interval(source.getContig(), source.getStart(), source.getEnd());
final Interval target = liftOver.liftOver(originalLocus);
if (target != null && !target.isNegativeStrand()) {
throw new RuntimeException("not reversed");
}

final VariantContext flipped = LiftoverUtils.liftVariant(source, target, reference, false, true);

VcfTestUtils.assertEquals(flipped, result);
}
Expand All @@ -554,7 +590,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r
public Iterator<Object[]> snpWithChangedRef() {

final ReferenceSequence reference = new ReferenceSequence("chr1", 0,
"CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes());
refString.getBytes());
// 123456789 123456789 123456789 123

final Allele RefT = Allele.create("T", true);
Expand Down Expand Up @@ -1086,6 +1122,7 @@ public Iterator<Object[]> noCallAndSymbolicData() {
builder.start(start).stop(start).alleles(CollectionUtil.makeList(CRef, T, DEL));
result_builder.start(liftedStart).stop(liftedStart).alleles(CollectionUtil.makeList(GRef, A, DEL));
result_builder.attribute(LiftoverVcf.ORIGINAL_START, start);
result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(builder.getAlleles()));
result_builder.attribute(LiftoverUtils.REV_COMPED_ALLELES, true);

genotypeBuilder.alleles(CollectionUtil.makeList(T, DEL));
Expand All @@ -1102,7 +1139,7 @@ public Iterator<Object[]> noCallAndSymbolicData() {
builder.start(start).stop(start).alleles(CollectionUtil.makeList(CRef, T));
result_builder.start(liftedStart).stop(liftedStart).alleles(CollectionUtil.makeList(GRef, A));
result_builder.attribute(LiftoverVcf.ORIGINAL_START, start);

result_builder.attribute(LiftoverVcf.ORIGINAL_ALLELES, LiftoverUtils.allelesToStringList(builder.getAlleles()));
genotypeBuilder.alleles(CollectionUtil.makeList(T, Allele.NO_CALL));
resultGenotypeBuilder.alleles(CollectionUtil.makeList(A, Allele.NO_CALL));
builder.genotypes(genotypeBuilder.make());
Expand All @@ -1120,10 +1157,16 @@ public void testLiftOverNoCallAndSymbolic(final LiftOver liftOver, final Variant

Assert.assertEquals(target.isNegativeStrand(), expectReversed);

VariantContext vc = LiftoverUtils.liftVariant(source, target, REFERENCE, true);
VariantContext vc = LiftoverUtils.liftVariant(source, target, REFERENCE, true, true);
VcfTestUtils.assertEquals(vc, result);

Assert.assertEquals(vc.getAttribute(LiftoverVcf.ORIGINAL_CONTIG), source.getContig());
Assert.assertEquals(vc.getAttribute(LiftoverVcf.ORIGINAL_START), source.getStart());
Assert.assertTrue(source.getAlleles().equals(result.getAlleles()) != result.hasAttribute(LiftoverVcf.ORIGINAL_ALLELES));
if (!source.getAlleles().equals(result.getAlleles())) {
List<String> resultAlleles = new ArrayList<>();
source.getAlleles().forEach(a -> resultAlleles.add(a.getDisplayString()));
Assert.assertEquals(resultAlleles, result.getAttributeAsStringList(LiftoverVcf.ORIGINAL_ALLELES, null));
}
}
}

0 comments on commit 359e3fd

Please sign in to comment.