diff --git a/build.gradle b/build.gradle index 174603ac73..fa88165297 100644 --- a/build.gradle +++ b/build.gradle @@ -89,6 +89,7 @@ dependencies { exclude module: 'htsjdk' } compile 'com.google.guava:guava:15.0' + compile 'org.apache.commons:commons-math3:3.5' compile 'com.github.samtools:htsjdk:' + htsjdkVersion compile 'org.broadinstitute:barclay:2.0.0' compileOnly googleNio diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java index f84f300a52..51364cc38b 100644 --- a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java +++ b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -184,8 +184,7 @@ protected CollectWgsMetrics.WgsMetrics getMetrics(final CountingFilter dupeFilte basesExcludedByCapping, coverageCap, getUnfilteredBaseQHistogram(), - collectWgsMetrics.SAMPLE_SIZE - ); + collectWgsMetrics.SAMPLE_SIZE); } /** diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 7b68935f08..e5096bc894 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -49,9 +49,7 @@ import picard.util.MathUtil; import java.io.File; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; +import java.util.*; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; @@ -122,6 +120,12 @@ public class CollectWgsMetrics extends CommandLineProgram { @ArgumentCollection protected IntervalArgumentCollection intervalArugmentCollection = makeIntervalArgumentCollection(); + @Argument(doc="Output for Theoretical Sensitivity metrics.", optional = true) + public File THEORETICAL_SENSITIVITY_OUTPUT; + + @Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true) + public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); + @Argument(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; @@ -455,6 +459,9 @@ protected int doWork() { if (INTERVALS != null) { IOUtil.assertFileIsReadable(INTERVALS); } + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { + IOUtil.assertFileIsWritable(THEORETICAL_SENSITIVITY_OUTPUT); + } // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) { @@ -491,6 +498,14 @@ protected int doWork() { processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); out.write(OUTPUT); + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { + // Write out theoretical sensitivity results. + final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(), collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION); + theoreticalSensitivityMetrics.addAllMetrics(tsm); + theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); + } + return 0; } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 9b93ab4ffd..3f6841f10b 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -26,13 +26,15 @@ import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.QualityUtil; import picard.PicardException; import picard.util.MathUtil; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Random; +import java.util.*; +import java.util.stream.IntStream; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.math3.distribution.BinomialDistribution; /** * Created by David Benjamin on 5/13/15. @@ -41,14 +43,17 @@ public class TheoreticalSensitivity { private static final Log log = Log.getInstance(TheoreticalSensitivity.class); private static final int SAMPLING_MAX = 600; //prevent 'infinite' loops - private static final int MAX_CONSIDERED_DEPTH = 1000; //no point in looking any deeper than this, otherwise GC overhead is too high. + private static final int MAX_CONSIDERED_DEPTH_HET_SENS = 1000; // No point in looking any deeper than this, otherwise GC overhead is too high. Only used for HET sensitivity. + private static final int LARGE_NUMBER_OF_DRAWS = 10; // The number of draws at which we believe a Gaussian approximation to sum random variables. + private static final double DEPTH_BIN_WIDTH = 0.01; // Minimal fraction of depth histogram to use when integrating theoretical sensitivity. This ensures we don't calculate theoretical sensitivity at every depth, which would be computationally expensive. + private static final int RANDOM_SEED = 51; /** - * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 + * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1. . . Q - * @param sampleSize sample size is the number of random sums of quality scores for each m - * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, - * for example 5 if the variant likelihood must be 10^5 times greater + * @param sampleSize sample size is the number of random sums of quality scores for each m + * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, + * for example 5 if the variant likelihood must be 10^5 times greater */ public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold) { @@ -56,16 +61,16 @@ public static double hetSNPSensitivity(final double[] depthDistribution, final d } /** - * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 + * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1. . . Q - * @param sampleSize sample size is the number of random sums of quality scores for each m - * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, - * for example 5 if the variant likelihood must be 10^5 times greater. - * @param withLogging true to output log messages, false otherwise. + * @param sampleSize sample size is the number of random sums of quality scores for each m + * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, + * for example 5 if the variant likelihood must be 10^5 times greater. + * @param withLogging true to output log messages, false otherwise. */ public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold, final boolean withLogging) { - final int N = Math.min(depthDistribution.length, MAX_CONSIDERED_DEPTH + 1); + final int N = Math.min(depthDistribution.length, MAX_CONSIDERED_DEPTH_HET_SENS + 1); if (withLogging) log.info("Creating Roulette Wheel"); final RouletteWheel qualitySampler = new RouletteWheel(qualityDistribution); @@ -143,7 +148,7 @@ public static class RouletteWheel { private Random rng; RouletteWheel(final double[] weights) { - rng = new Random(51); + rng = new Random(RANDOM_SEED); N = weights.length; probabilities = new ArrayList<>(); @@ -204,4 +209,183 @@ public static double[] normalizeHistogram(final Histogram histogram) { } return normalizedHistogram; } + + /** + * Determines if a variant would be called under the particular conditions of a given total depth, alt depth, + * average base qualities, allele fraction of variant and log odds threshold necessary to call variant. + * @param totalDepth Depth at the site to be called, both alt and ref. + * @param altDepth Number of alt bases at this site. + * @param sumOfAltQualities Average Phred-scaled quality of bases + * @param alleleFraction Allele fraction we are attempting to detect + * @param logOddsThreshold Log odds threshold necessary for variant to be called + * @return Whether or not the model would call a variant given the parameters + */ + @VisibleForTesting + static boolean isCalled(final int totalDepth, final int altDepth, final double sumOfAltQualities, final double alleleFraction, final double logOddsThreshold) { + final double threshold = 10.0 * (altDepth * -Math.log10(alleleFraction) + (totalDepth - altDepth) * -Math.log10(1.0 - alleleFraction) + logOddsThreshold); + + return sumOfAltQualities > threshold; + } + + public TheoreticalSensitivity() { + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant + * depth. + * @param depth Depth to compute sensitivity at + * @param qualityHistogram Phred-scaled quality score histogram + * @param logOddsThreshold Log odd threshold necessary for variant to be called + * @param sampleSize sampleSize is the total number of simulations to run + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @param randomSeed random number seed to use for random number generator + * @return Theoretical sensitivity for the given arguments at a constant depth. + */ + public static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final long randomSeed) { + final RouletteWheel qualityRW = new RouletteWheel(trimDistribution(normalizeHistogram(qualityHistogram))); + final Random randomNumberGenerator = new Random(randomSeed); + final BinomialDistribution bd = new BinomialDistribution(depth, alleleFraction); + + // Calculate mean and deviation of quality score distribution to enable Gaussian sampling below + final double averageQuality = qualityHistogram.getMean(); + final double standardDeviationQuality = qualityHistogram.getStandardDeviation(); + + int calledVariants = 0; + // Sample simulated variants, and count the number that would get called. The ratio + // of the number called to the total sampleSize is the sensitivity. + for (int sample = 0; sample < sampleSize; sample++) { + final int altDepth = bd.sample(); + + final int sumOfQualities; + if (altDepth < LARGE_NUMBER_OF_DRAWS) { + // If the number of alt reads is "small" we draw from the actual base quality distribution. + sumOfQualities = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); + } else { + // If the number of alt reads is "large" we draw from a Gaussian approximation of the base + // quality distribution to speed up the code. + sumOfQualities = drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian()); + } + + if (isCalled(depth, altDepth, (double) sumOfQualities, alleleFraction, logOddsThreshold)) { + calledVariants++; + } + } + return (double) calledVariants / sampleSize; + } + + /** + * Simulates the sum of base qualities taken from reads that support the alternate allele by + * taking advantage of the fact that the sum of draws from a distribution tends towards a + * Gaussian per the Central Limit Theorem. + * @param altDepth Number of draws to take from base quality distribution + * @param averageQuality Average quality of alt bases + * @param standardDeviationQuality Sample standard deviation of base quality scores + * @param z number of standard deviation from the mean to take sum over + * @return Simulated sum of base qualities the support the alternate allele + */ + static int drawSumOfQScores(final int altDepth, final double averageQuality, final double standardDeviationQuality, final double z) { + return (int) (altDepth * averageQuality + z * Math.sqrt(altDepth) * standardDeviationQuality); + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant + * depth. + * @param depth Depth to compute sensitivity at + * @param qualityHistogram Phred-scaled quality score histogram + * @param logOddsThreshold Log odds threshold necessary for variant to be called + * @param sampleSize the total number of simulations to run + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @return Theoretical sensitivity for the given arguments at a constant depth. + */ + private static double sensitivityAtConstantDepth(final int depth, final Histogram qualityHistogram, final double logOddsThreshold, final int sampleSize, final double alleleFraction) { + return sensitivityAtConstantDepth(depth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction, RANDOM_SEED); + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution and depth + * distribution. + * @param depthHistogram Depth histogram to compute theoretical sensitivity over + * @param qualityHistogram Phred-scaled quality score histogram + * @param sampleSize the total number of simulations to run + * @param logOddsThreshold Log odds threshold necessary for variant to be called + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @return Theoretical sensitivity for the given arguments over a particular depth distribution. + */ + public static double theoreticalSensitivity(final Histogram depthHistogram, final Histogram qualityHistogram, + final int sampleSize, final double logOddsThreshold, final double alleleFraction) { + if (alleleFraction > 1.0 || alleleFraction < 0.0) { + throw new IllegalArgumentException("Allele fractions must be between 0 and 1."); + } + + final double[] depthDistribution = normalizeHistogram(depthHistogram); + + // Integrate sensitivity over depth distribution + double sensitivity = 0.0; + int currentDepth = 0; + double right = 0; + while (currentDepth < depthDistribution.length) { + double deltaDepthProbability = 0.0; + // Accumulate a portion of the depth distribution to compute theoretical sensitivity over. + // This helps prevent us from spending lots of compute over coverages + // that occur with low probability and don't contribute much to sensitivity anyway, but + // it complicates things a bit by having a variable deltaDepthProbability which + // amount of the depth distribution to use with the trapezoid rule integration step. + while (deltaDepthProbability == 0 && currentDepth < depthDistribution.length || + deltaDepthProbability < DEPTH_BIN_WIDTH && currentDepth < depthDistribution.length && + depthDistribution[currentDepth] < DEPTH_BIN_WIDTH / 2.0) { + deltaDepthProbability += depthDistribution[currentDepth]; + currentDepth++; + } + // Calculate sensitivity for a particular depth, and use trapezoid rule to integrate sensitivity. + final double left = right; + right = sensitivityAtConstantDepth(currentDepth, qualityHistogram, logOddsThreshold, sampleSize, alleleFraction); + sensitivity += deltaDepthProbability * (left + right) / 2.0; + } + return sensitivity; + } + + /** + * Removes trailing zeros in a distribution. The purpose of this function is to prevent other + * functions from evaluating in regions where the distribution has zero probability. + * @param distribution Distribution of base qualities + * @return Distribution of base qualities removing any trailing zeros + */ + static double[] trimDistribution(final double[] distribution) { + int endOfDistribution = distribution.length - 1; + while(distribution[endOfDistribution] == 0) { + endOfDistribution--; + } + + // Remove trailing zeros and return. + return Arrays.copyOfRange(distribution, 0, endOfDistribution); + } + + /** + * This is a utility function to calculate the metrics specific to running + * theoretical sensitivity over several different allele fractions. + * @param simulationSize Number of simulations to run at each depth. + * @param depthHistogram Histogram of depth distribution. + * @param baseQHistogram Histogram of Phred-scaled quality scores. + * @param alleleFractions List of allele fractions to measure theoretical sensitivity over. + */ + public static List calculateSensitivities(final int simulationSize, + final Histogram depthHistogram, final Histogram baseQHistogram, final List alleleFractions) { + + final List metricsOverVariousAlleleFractions = new ArrayList<>(); + final double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. + + // For each allele fraction in alleleFractions calculate theoretical sensitivity and add the results + // to the histogram sensitivityHistogram. + for (final double alleleFraction : alleleFractions) { + final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); + theoreticalSensitivityMetrics.ALLELE_FRACTION = alleleFraction; + theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, baseQHistogram, simulationSize, logOddsThreshold, alleleFraction); + theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - theoreticalSensitivityMetrics.THEORETICAL_SENSITIVITY)); + theoreticalSensitivityMetrics.SAMPLE_SIZE = simulationSize; + theoreticalSensitivityMetrics.LOG_ODDS_THRESHOLD = logOddsThreshold; + metricsOverVariousAlleleFractions.add(theoreticalSensitivityMetrics); + } + + return metricsOverVariousAlleleFractions; + } } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java new file mode 100644 index 0000000000..5dae93e846 --- /dev/null +++ b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java @@ -0,0 +1,49 @@ +/* + * The MIT License + * + * Copyright (c) 2018 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.metrics.MetricBase; + +/** + * TheoreticalSensitivityMetrics, are metrics calculated from TheoreticalSensitivity and parameters used in + * the calculation. + * + * These metrics are intended to estimate the achievable sensitivity (for a particular log odds threshold) of + * somatic calls for a particular depth and base quality score distribution. + * + * @author Mark Fleharty + */ +public class TheoreticalSensitivityMetrics extends MetricBase { + /** The allele fraction which theoretical sensitivity is calculated for. */ + public double ALLELE_FRACTION; + /** Estimation of sensitivity at a particular allele fraction. */ + public double THEORETICAL_SENSITIVITY; + /** Phred-scaled value of 1-THEORETICAL_SENSITIVITY. */ + public int THEORETICAL_SENSITIVITY_Q; + /** Log-likelihood ratio is used as a threshold to distinguish a positive site with a given allele fraction from HOM_REF. */ + public double LOG_ODDS_THRESHOLD; + /** Number of samples drawn at each depth in the depth distribution. Larger values allow for increased precision at the cost of compute time. */ + public int SAMPLE_SIZE; +} diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index eb17442e5c..f9dde76c1b 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -16,13 +16,14 @@ import htsjdk.samtools.util.SequenceUtil; import org.broadinstitute.barclay.argparser.Argument; import picard.analysis.MetricAccumulationLevel; +import picard.analysis.TheoreticalSensitivity; +import picard.analysis.TheoreticalSensitivityMetrics; import picard.cmdline.CommandLineProgram; import picard.cmdline.StandardOptionDefinitions; import picard.metrics.MultilevelMetrics; import java.io.File; -import java.util.List; -import java.util.Set; +import java.util.*; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; @@ -98,6 +99,11 @@ protected abstract COLLECTOR makeCollector(final Set ac @Argument(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; + @Argument(doc="Output for Theoretical Sensitivity metrics where the allele fractions are provided by the ALLELE_FRACTION argument.", optional = true) + public File THEORETICAL_SENSITIVITY_OUTPUT; + + @Argument(doc="Allele fraction for which to calculate theoretical sensitivity.", optional = true) + public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); /** * Asserts that files are readable and writable and then fires off an * HsMetricsCalculator instance to do the real work. @@ -156,6 +162,14 @@ protected int doWork() { metrics.write(OUTPUT); + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { + // Write out theoretical sensitivity results. + final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); + List tsm = TheoreticalSensitivity.calculateSensitivities(SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION); + theoreticalSensitivityMetrics.addAllMetrics(tsm); + theoreticalSensitivityMetrics.write(THEORETICAL_SENSITIVITY_OUTPUT); + } + CloserUtil.close(reader); return 0; } diff --git a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java index a8b4578da6..4a8b3f3350 100644 --- a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java +++ b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java @@ -124,6 +124,8 @@ public abstract class TargetMetricsCollector getBaseQualityHistogram() { + return unfilteredBaseQHistogram; + } + + public Histogram getDepthHistogram() { + return unfilteredDepthHistogram; + } } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index c36974088f..a2f33d47fe 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -26,16 +26,15 @@ import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; +import org.apache.commons.math3.distribution.BinomialDistribution; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.List; -import java.util.Arrays; +import java.util.*; import java.io.FileReader; import java.io.File; -import java.util.Scanner; +import java.util.stream.IntStream; /** * Created by davidben on 5/18/15. @@ -207,17 +206,20 @@ public Object[][] hetSensDataProvider() { } @Test(dataProvider = "hetSensDataProvider") - public void testHetSensTargeted(final double expected, final File metricsFile) throws Exception{ + public void testHetSensTargeted(final double expected, final File metricsFile) throws Exception { final double tolerance = 0.000_000_01; - final MetricsFile Metrics = new MetricsFile(); - Metrics.read(new FileReader(metricsFile)); - final List histograms = Metrics.getAllHistograms(); - final Histogram depthHistogram = histograms.get(0); - final Histogram qualityHistogram = histograms.get(1); + final MetricsFile metrics = new MetricsFile(); + try (final FileReader metricsFileReader = new FileReader(metricsFile)) { + metrics.read(metricsFileReader); + } + + final List> histograms = metrics.getAllHistograms(); + final Histogram depthHistogram = histograms.get(0); + final Histogram qualityHistogram = histograms.get(1); - final double [] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); - final double [] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); final int sampleSize = 1_000; final double logOddsThreshold = 3.0; @@ -225,4 +227,176 @@ public void testHetSensTargeted(final double expected, final File metricsFile) t final double result = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, logOddsThreshold); Assert.assertEquals(result, expected, tolerance); } + + @DataProvider(name = "TheoreticalSensitivityConstantDepthDataProvider") + public Object[][] fractionalAlleleSensDataProvider() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); + + // These magic numbers come from a separate implementation of the code in R. + return new Object[][]{ + // Expected sensitivity, metrics file, allele fraction, constant depth, sample size. + {1.00, wgsMetricsFile, .5, 30, 10000, 0.01}, + {0.78, targetedMetricsFile, .1, 30, 10000, 0.02}, + {0.26, targetedMetricsFile, 0.1, 10, 10000, 0.01} + }; + } + + @Test(dataProvider = "TheoreticalSensitivityConstantDepthDataProvider") + public void testSensitivityAtConstantDepth(final double expected, final File metricsFile, final double alleleFraction, final int depth, final int sampleSize, final double tolerance) throws Exception { + // This tests Theoretical Sensitivity assuming a uniform depth with a distribution of base quality scores. + // Because this only tests sensitivity at a constant depth, we use this for testing the code at high depths. + final MetricsFile metrics = new MetricsFile(); + try (final FileReader metricsFileReader = new FileReader(metricsFile)) { + metrics.read(metricsFileReader); + } + + final List> histograms = metrics.getAllHistograms(); + final Histogram qualityHistogram = histograms.get(1); + + // We ensure that even using different random seeds we converge to roughly the same value. + for (int i = 0;i < 3;i++) { + double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityHistogram, 3, sampleSize, alleleFraction, i); + Assert.assertEquals(result, expected, tolerance); + } + } + + @DataProvider(name = "TheoreticalSensitivityDataProvider") + public Object[][] arbFracSensDataProvider() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + + // This test acts primarily as an integration test. The sample size of 200 + // is not quite large enough to converge properly, but is used for the purpose of + // keeping the compute time of the tests short. + return new Object[][] { + {0.90, wgsMetricsFile, 0.5, 200}, + {0.77, wgsMetricsFile, 0.3, 200}, + {0.29, wgsMetricsFile, 0.1, 200}, + {0.08, wgsMetricsFile, 0.05, 200}, + }; + } + + @Test(dataProvider = "TheoreticalSensitivityDataProvider") + public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize) throws Exception { + // This tests Theoretical Sensitivity using distributions on both base quality scores + // and the depth histogram. + + // We use a pretty forgiving tolerance here because for these tests + // we are not using large enough sample sizes to converge. + final double tolerance = 0.02; + + final MetricsFile metrics = new MetricsFile(); + try (final FileReader metricsFileReader = new FileReader(metricsFile)) { + metrics.read(metricsFileReader); + } + + final List> histograms = metrics.getAllHistograms(); + final Histogram depthHistogram = histograms.get(0); + final Histogram qualityHistogram = histograms.get(1); + + final double result = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, alleleFraction); + + Assert.assertEquals(result, expected, tolerance); + } + + @DataProvider(name = "equivalanceHetVsArbitrary") + public Object[][] equivalenceHetVsFull() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); + + return new Object[][] { + // The sample sizes chosen here for these tests are smaller than what would normally be used + // in order to keep the test time low. It should be noted that for larger sample sizes + // the values converge. + {wgsMetricsFile, 0.02, 200}, + {targetedMetricsFile, 0.01, 50} + }; + } + + @Test(dataProvider = "equivalanceHetVsArbitrary") + public void testHetVsArbitrary(final File metricsFile, final double tolerance, final int sampleSize) throws Exception { + // This test compares Theoretical Sensitivity for arbitrary allele fractions with the theoretical het sensitivity + // model. Since allele fraction of 0.5 is equivalent to a het, these should provide the same answer. + final MetricsFile metrics = new MetricsFile(); + try (final FileReader metricsFileReader = new FileReader(metricsFile)) { + metrics.read(metricsFileReader); + } + + final List> histograms = metrics.getAllHistograms(); + final Histogram depthHistogram = histograms.get(0); + final Histogram qualityHistogram = histograms.get(1); + + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + + final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthHistogram, qualityHistogram, sampleSize, 3, 0.5); + final double resultFromTHS = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, 3); + + Assert.assertEquals(resultFromTS, resultFromTHS, tolerance); + } + + @DataProvider(name = "callingThresholdDataProvider") + public Object[][] callingThreshold() { + return new Object[][] { + // These values were tested with an independent implementation in R. + // Test a transition due to a change in the logOddsThreshold + {100, 10, 10*20, .1, 5.8, true}, + {100, 10, 10*20, .1, 5.9, false}, + + // Test a transition due to change in average base quality from 20 to 21 + {100, 10, 10*21, .1, 6.2, true}, + {100, 10, 10*20, .1, 6.2, false}, + + // Test a transition due to change in total depth + {115, 10, 10*21, .1, 6.2, false}, + {114, 10, 10*21, .1, 6.2, true} + }; + } + + @Test(dataProvider = "callingThresholdDataProvider") + public void testCallingThreshold(final int totalDepth, final int altDepth, final double sumOfAltQualities, final double alleleFraction, final double logOddsThreshold, final boolean expectedCall) { + Assert.assertEquals(TheoreticalSensitivity.isCalled(totalDepth, altDepth, sumOfAltQualities, alleleFraction, logOddsThreshold), expectedCall); + } + + @DataProvider(name = "sumOfGaussiansDataProvider") + public Object[][] sumOfGaussians() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); + + // When we sum more base qualities from a particular distribution, it should look increasingly Gaussian. + return new Object[][]{ + {wgsMetricsFile, 500, 0.03}, + {wgsMetricsFile, 20, 0.05}, + {wgsMetricsFile, 10, 0.10}, + {targetedMetricsFile, 500, 0.03}, + {targetedMetricsFile, 20, 0.05}, + {targetedMetricsFile, 10, 0.10} + }; + } + + @Test(dataProvider = "sumOfGaussiansDataProvider") + public void testDrawSumOfQScores(final File metricsFile, final int altDepth, final double tolerance) throws Exception { + final MetricsFile metrics = new MetricsFile<>(); + try (final FileReader metricsFileReader = new FileReader(metricsFile)) { + metrics.read(metricsFileReader); + } + + final List> histograms = metrics.getAllHistograms(); + + final Histogram qualityHistogram = histograms.get(1); + final TheoreticalSensitivity.RouletteWheel qualityRW = new TheoreticalSensitivity.RouletteWheel(TheoreticalSensitivity.trimDistribution(TheoreticalSensitivity.normalizeHistogram(qualityHistogram))); + + final Random randomNumberGenerator = new Random(51); + + // Calculate mean and deviation of quality score distribution to enable Gaussian sampling below + final double averageQuality = qualityHistogram.getMean(); + final double standardDeviationQuality = qualityHistogram.getStandardDeviation(); + + for(int k = 0;k < 1;k++) { + int sumOfQualitiesFull = IntStream.range(0, altDepth).map(n -> qualityRW.draw()).sum(); + int sumOfQualities = TheoreticalSensitivity.drawSumOfQScores(altDepth, averageQuality, standardDeviationQuality, randomNumberGenerator.nextGaussian()); + + Assert.assertEquals(sumOfQualitiesFull, sumOfQualities, sumOfQualitiesFull*tolerance); + } + } } diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index 1856472e72..2f7acc85b3 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -1,12 +1,6 @@ package picard.analysis.directed; -import htsjdk.samtools.SAMException; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMFileWriterFactory; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordSetBuilder; +import htsjdk.samtools.*; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Interval; @@ -15,6 +9,8 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import picard.analysis.TheoreticalSensitivity; +import picard.analysis.TheoreticalSensitivityMetrics; import picard.cmdline.CommandLineProgramTest; import picard.sam.SortSam; import picard.util.TestNGUtil; @@ -23,7 +19,7 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.Random; +import java.util.*; public class CollectTargetedMetricsTest extends CommandLineProgramTest { @@ -31,8 +27,10 @@ public class CollectTargetedMetricsTest extends CommandLineProgramTest { private final File dict = new File(TEST_DATA_DIR+"chrM.reference.dict"); private File tempSamFile; private File outfile; + private File tsOutfile; // Theoretical sensitivity file private File perTargetOutfile; - private static final int LENGTH = 99; + private final static int LENGTH = 99; + private final static int RANDOM_SEED = 51; final String referenceFile = TEST_DATA_DIR + "chrM.reference.fasta"; final String emptyIntervals = TEST_DATA_DIR + "chrM.empty.interval_list"; @@ -79,6 +77,7 @@ void setupBuilder() throws IOException { //Add to setBuilder final SAMRecordSetBuilder setBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate); + setBuilder.setRandomSeed(RANDOM_SEED); setBuilder.setReadGroup(readGroupRecord); setBuilder.setUseNmFlag(true); setBuilder.setHeader(header); @@ -118,8 +117,10 @@ void setupBuilder() throws IOException { //create output files for tests outfile = File.createTempFile("test", ".TargetedMetrics_Coverage"); perTargetOutfile = File.createTempFile("perTarget", ".perTargetCoverage"); + tsOutfile = File.createTempFile("test", ".TheoreticalSensitivityMetrics"); outfile.deleteOnExit(); perTargetOutfile.deleteOnExit(); + tsOutfile.deleteOnExit(); } @DataProvider(name = "targetedIntervalDataProvider") @@ -157,6 +158,59 @@ public void runCollectTargetedMetricsTest(final File input, final File outfile, } } + @DataProvider(name = "theoreticalSensitivityDataProvider") + public Object[][] theoreticalSensitivityDataProvider() { + + return new Object[][] { + // This test is primarily used as an integration test since theoretical sensitivity doesn't converge + // well with a sample size of 1000 rather than 10,000. The sample size is set so low as to prevent + // the tests from taking too long to run. + {tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 1000, + Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction + Arrays.asList(0.01, 0.52, 0.93, 0.99, 0.99) // Expected sensitivity + } + }; + } + + @Test(dataProvider = "theoreticalSensitivityDataProvider") + public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File tsOutfile, final File perTargetOutfile, final String referenceFile, + final String targetIntervals, final int sampleSize, final List alleleFractions, final List expectedSensitivities) throws IOException { + + final String[] args = new String[] { + "TARGET_INTERVALS=" + targetIntervals, + "INPUT=" + input.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + referenceFile, + "PER_TARGET_COVERAGE=" + perTargetOutfile.getAbsolutePath(), + "LEVEL=ALL_READS", + "AMPLICON_INTERVALS=" + targetIntervals, + "THEORETICAL_SENSITIVITY_OUTPUT=" + tsOutfile.getAbsolutePath(), + "SAMPLE_SIZE=" + sampleSize + }; + + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(tsOutfile.getAbsolutePath())); + + // Create map of theoretical sensitivities read from file + final Iterator metrics = output.getMetrics().iterator(); + final Map map = new HashMap<>(); + while (metrics.hasNext()) { + TheoreticalSensitivityMetrics m = metrics.next(); + map.put(m.ALLELE_FRACTION, m.THEORETICAL_SENSITIVITY); + } + + // Compare theoretical sensitivities created by CollectTargetedMetrics + // with those in the test data provider. + final Iterator alleleFraction = alleleFractions.iterator(); + final Iterator expectedSensitivity = expectedSensitivities.iterator(); + while (alleleFraction.hasNext() || expectedSensitivity.hasNext()) { + // Provide wiggle room of 1% in the comparisons + Assert.assertEquals(map.get(alleleFraction.next()), expectedSensitivity.next(), 0.01); + } + } + @Test() public void testRawBqDistributionWithSoftClips() throws IOException { final String input = TEST_DATA_DIR + "chrMReadsWithClips.sam";