diff --git a/build.sbt b/build.sbt
index 3ce516df..87355439 100644
--- a/build.sbt
+++ b/build.sbt
@@ -17,7 +17,7 @@ lazy val hadoopVersion = Properties.envOrElse("SPARK_HADOOP_VERSION", DEFAULT_HA
 
 dependencyOverrides += "com.google.guava" % "guava" % "15.0"
 
-
+libraryDependencies += "org.scala-lang" % "scala-compiler" % "2.12.12"
 libraryDependencies += "org.seqdoop" % "hadoop-bam" % "7.10.0"
 libraryDependencies += "org.apache.hadoop" % "hadoop-client" % hadoopVersion
 libraryDependencies += "org.apache.spark" % "spark-core_2.12" % sparkVersion
@@ -28,7 +28,7 @@ libraryDependencies += "com.holdenkarau" % "spark-testing-base_2.12" % "2.4.3_0.
 libraryDependencies += "org.bdgenomics.adam" %% "adam-core-spark2" % "0.27.0"
 libraryDependencies += "org.bdgenomics.adam" %% "adam-apis-spark2" % "0.27.0"
 libraryDependencies += "org.bdgenomics.adam" %% "adam-cli-spark2" % "0.27.0"
-libraryDependencies += "org.scala-lang" % "scala-library" % scalaVersion.toString()
+libraryDependencies += "org.scala-lang" % "scala-library" % scalaVersion.value
 libraryDependencies += "org.rogach" %% "scallop" % "3.1.2"
 libraryDependencies += "org.bdgenomics.utils" % "utils-metrics-spark2_2.12" % "0.2.16"
 libraryDependencies += "com.github.samtools" % "htsjdk" % "2.19.0"
@@ -38,7 +38,6 @@ libraryDependencies += "org.apache.logging.log4j" % "log4j-core" % "2.12.1"
 libraryDependencies += "org.apache.logging.log4j" % "log4j-api" % "2.12.1"
 libraryDependencies += "com.intel.gkl" % "gkl" % "0.8.5-1-darwin-SNAPSHOT"
 libraryDependencies += "com.intel.gkl" % "gkl" % "0.8.5-1-linux-SNAPSHOT"
-//libraryDependencies += "org.hammerlab.bam" %% "load" % "1.2.0-M1"
 libraryDependencies += "de.ruedigermoeller" % "fst" % "2.57"
 libraryDependencies += "org.apache.commons" % "commons-lang3" % "3.7"
 libraryDependencies += "org.eclipse.jetty" % "jetty-servlet" % "9.3.24.v20180605"
@@ -69,6 +68,18 @@ javaOptions ++= Seq("-Xms512M", "-Xmx8192M", "-XX:+CMSClassUnloadingEnabled" , "
 //fix for using with hdp warehouse connector
 javacOptions ++= Seq("-source", "1.8", "-target", "1.8", "-Xlint")
 
+scalacOptions ++=Seq (
+ "-opt:unreachable-code",
+ "-opt:simplify-jumps",
+ "-opt:redundant-casts",
+ "-opt:box-unbox"
+)
+//"-opt:simplify-jumps",
+//"-opt:allow-skip-core-module-init",
+//"-opt-warnings:any-inline-failed"
+// "-opt:l:method",
+//"-opt:l:inline",
+//"-opt-inline-from:org.biodatageeks.sequila.pileup.model.**",
 updateOptions := updateOptions.value.withLatestSnapshots(false)
 
 outputStrategy := Some(StdoutOutput)
diff --git a/src/main/scala/org/biodatageeks/sequila/pileup/PileupMethods.scala b/src/main/scala/org/biodatageeks/sequila/pileup/PileupMethods.scala
index 06a23939..b6bc7ff6 100644
--- a/src/main/scala/org/biodatageeks/sequila/pileup/PileupMethods.scala
+++ b/src/main/scala/org/biodatageeks/sequila/pileup/PileupMethods.scala
@@ -32,7 +32,8 @@ object PileupMethods {
     val enableInstrumentation = spark.sqlContext.getConf(InternalParams.EnableInstrumentation).toBoolean
     val alignmentsInstr = if(enableInstrumentation) alignments.instrument() else alignments
     val storageLevel =
-      if (spark.sqlContext.getConf(InternalParams.SerializationMode)==StorageLevel.DISK_ONLY.toString()) StorageLevel.DISK_ONLY
+      if (spark.sqlContext.getConf(InternalParams.SerializationMode, StorageLevel.MEMORY_AND_DISK.toString())==StorageLevel.DISK_ONLY.toString())
+        StorageLevel.DISK_ONLY
       else StorageLevel.MEMORY_AND_DISK
 
     //FIXME: Add automatic unpersist
diff --git a/src/main/scala/org/biodatageeks/sequila/pileup/model/AlignmentsRDD.scala b/src/main/scala/org/biodatageeks/sequila/pileup/model/AlignmentsRDD.scala
index 6febf407..cc35b048 100644
--- a/src/main/scala/org/biodatageeks/sequila/pileup/model/AlignmentsRDD.scala
+++ b/src/main/scala/org/biodatageeks/sequila/pileup/model/AlignmentsRDD.scala
@@ -91,7 +91,7 @@ case class AlignmentsRDD(rdd: RDD[SAMRecord]) {
         contigEventAgg.contigLen,
         util.Arrays.copyOfRange(contigEventAgg.events, 0, maxIndex + 1), //FIXME: https://stackoverflow.com/questions/37969193/why-is-array-slice-so-shockingly-slow
         contigEventAgg.alts,
-        contigEventAgg.quals,
+        contigEventAgg.trimQuals,
         contigEventAgg.startPosition,
         contigEventAgg.startPosition + maxIndex,
         0,
diff --git a/src/main/scala/org/biodatageeks/sequila/pileup/model/ContigAggregate.scala b/src/main/scala/org/biodatageeks/sequila/pileup/model/ContigAggregate.scala
index 690177f7..db9c2ba0 100644
--- a/src/main/scala/org/biodatageeks/sequila/pileup/model/ContigAggregate.scala
+++ b/src/main/scala/org/biodatageeks/sequila/pileup/model/ContigAggregate.scala
@@ -38,6 +38,7 @@ case class ContigAggregate(
   def getPileupUpdate:PileupUpdate = new PileupUpdate(ArrayBuffer(getTail), ArrayBuffer(getRange))
   def getAltPositionsForRange(start: Int, end: Int): SortedSet[Int] = altsKeyCache.range(start,end+1)
   def addToCache(readQualSummary: ReadQualSummary):Unit = qualityCache.addOrReplace(readQualSummary)
+  def trimQuals: MultiLociQuals = if(quals != null) quals.trim else null
 
   def calculateMaxLength(allPositions: Boolean): Int = {
     if (! allPositions)
@@ -60,8 +61,8 @@ case class ContigAggregate(
     }
   }
 
-  def updateQuals(pos: Int, alt: Char, quality: Byte, firstUpdate: Boolean = false): Unit = {
-    quals.updateQuals(pos, alt,quality, firstUpdate)
+  def updateQuals(pos: Int, alt: Char, quality: Byte, firstUpdate: Boolean = false, updateMax:Boolean = true): Unit = {
+    quals.updateQuals(pos, alt,quality, firstUpdate, updateMax)
   }
 
   def getTail:Tail ={
@@ -148,7 +149,7 @@ case class ContigAggregate(
       val reads = correction.qualityCache.getReadsOverlappingPosition(pos)
       for (read <- reads) {
         val qual = read.getBaseQualityForPosition(pos.toInt)
-        adjustedQuals.updateQuals(pos.toInt, QualityConstants.REF_SYMBOL, qual, false)
+        adjustedQuals.updateQuals(pos.toInt, QualityConstants.REF_SYMBOL, qual, firstUpdate = false, updateMax = false)
       }
     }
     adjustedQuals
@@ -173,7 +174,7 @@ case class ContigAggregate(
       val reads = qualityCache.getReadsOverlappingPositionFullCache(pos)
       for (read <- reads) {
         val qual = read.getBaseQualityForPosition(pos.toInt)
-        qualsInterim.updateQuals(pos.toInt, QualityConstants.REF_SYMBOL, qual, false)
+        qualsInterim.updateQuals(pos.toInt, QualityConstants.REF_SYMBOL, qual, firstUpdate = false, updateMax = false)
       }
     }
     qualsInterim
diff --git a/src/main/scala/org/biodatageeks/sequila/pileup/model/Quals.scala b/src/main/scala/org/biodatageeks/sequila/pileup/model/Quals.scala
index 2c296c7c..afaec29b 100644
--- a/src/main/scala/org/biodatageeks/sequila/pileup/model/Quals.scala
+++ b/src/main/scala/org/biodatageeks/sequila/pileup/model/Quals.scala
@@ -56,17 +56,36 @@ object Quals {
       map.map({ case (k, v) => k -> v.take(v(QualityConstants.MAX_QUAL_IND) + 1) })
     }
 
-    def addQualityForAlt(alt: Char, quality: Byte): Unit = {
-      val altByte = alt.toByte
-      val qualityIndex = quality
-      if (!map.contains(altByte)) {
-        val array = new Array[Short](QualityConstants.QUAL_ARR_SIZE)
-        array(qualityIndex) = (array(qualityIndex) + 1).toShort
-        map.update(altByte, array)
-      }
-      else
-        map(altByte)(qualityIndex) = (map(altByte)(qualityIndex) + 1).toShort
+    def addQualityForAlt(alt: Char, quality: Byte, updateMax:Boolean): Unit = {
+      val altByte = alt.toByte
+      val qualityIndex = quality
+
+      if (!map.contains(altByte)) {
+        val array = new Array[Short](QualityConstants.QUAL_ARR_SIZE)
+        array(qualityIndex) = 1.toShort // no need for incrementing. first and last time here.
+        array(QualityConstants.MAX_QUAL_IND) = qualityIndex
+        map.update(altByte, array)
+        return
+      }
+
+      if(updateMax) {
+        map(altByte)(qualityIndex) = (map(altByte)(qualityIndex) + 1).toShort
+        if (qualityIndex > map(altByte).last)
+          map(altByte)(QualityConstants.MAX_QUAL_IND) = qualityIndex
+        return
+      }
+
+      if (qualityIndex >= map(altByte).length){
+        val array = new Array[Short](QualityConstants.QUAL_ARR_SIZE)
+        System.arraycopy(map(altByte),0,array, 0, map(altByte).length)
+        array(qualityIndex) = 1.toShort
+        map.update(altByte, array)
+        return
     }
+
+      map(altByte)(qualityIndex) = (map(altByte)(qualityIndex) + 1).toShort
+
+    }
   }
 
   implicit class MultiLociQualsExtension(val map: Quals.MultiLociQuals) {
@@ -75,13 +94,13 @@
 
     def trim: MultiLociQuals = map.map({ case (k, v) => k -> v.trim })
 
     @inline
-    def updateQuals(position: Int, alt: Char, quality: Byte, firstUpdate:Boolean = false): Unit = {
+    def updateQuals(position: Int, alt: Char, quality: Byte, firstUpdate:Boolean = false, updateMax:Boolean=false): Unit = {
       if( !firstUpdate || map.contains(position) ) {
-        map(position).addQualityForAlt(alt, quality)
+        map(position).addQualityForAlt(alt, quality, updateMax)
       } else {
         val singleLocusQualMap = new SingleLocusQuals()
-        singleLocusQualMap.addQualityForAlt(alt, quality)
+        singleLocusQualMap.addQualityForAlt(alt, quality, updateMax)
         map.update(position, singleLocusQualMap)
       }
     }
diff --git a/src/main/scala/org/biodatageeks/sequila/pileup/model/ReadQualSummary.scala b/src/main/scala/org/biodatageeks/sequila/pileup/model/ReadQualSummary.scala
index c2f55e95..63f6ab02 100644
--- a/src/main/scala/org/biodatageeks/sequila/pileup/model/ReadQualSummary.scala
+++ b/src/main/scala/org/biodatageeks/sequila/pileup/model/ReadQualSummary.scala
@@ -16,12 +16,7 @@ case class ReadQualSummary (start: Int, end: Int,
   def overlapsPosition(pos: Int): Boolean = !hasDeletionOnPosition(pos) && start <= pos && end >= pos
 
   @inline
-  def relativePosition(absPosition: Int): Int = {
-    if(!cigarDerivedConf.hasClip)
-      absPosition - start + inDelEventsOffset(absPosition)
-    else
-      absPosition - start + inDelEventsOffset(absPosition) + cigarDerivedConf.leftClipLength
-  }
+  def relativePosition(absPosition: Int): Int = absPosition - start + inDelEventsOffset(absPosition) + cigarDerivedConf.leftClipLength
 
   @inline
   private def inDelEventsOffset(pos: Int): Int = {
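
Note on the Quals.scala hunk above: the sketch below is a minimal, standalone illustration (not the committed code) of the bookkeeping addQualityForAlt now performs — each alt keeps a histogram of quality counts whose last slot records the maximum observed quality index, so trim can later cut the array down to max + 1 bins. QualHistogramSketch, QualArrSize and MaxQualInd are hypothetical stand-ins for the real QualityConstants values, and the three branches of the committed method are condensed into a single update path here.

import scala.collection.mutable

object QualHistogramSketch {
  // Hypothetical stand-ins for QualityConstants.QUAL_ARR_SIZE / MAX_QUAL_IND.
  val QualArrSize = 42
  val MaxQualInd: Int = QualArrSize - 1 // last slot stores the max observed quality index

  type SingleLocusQuals = mutable.HashMap[Byte, Array[Short]]

  def addQualityForAlt(map: SingleLocusQuals, alt: Char, quality: Byte, updateMax: Boolean): Unit = {
    val altByte = alt.toByte
    val qualityIndex = quality.toInt

    if (!map.contains(altByte)) {
      // First observation of this alt: fresh histogram, count = 1, max = this quality.
      val array = new Array[Short](QualArrSize)
      array(qualityIndex) = 1.toShort
      array(MaxQualInd) = qualityIndex.toShort
      map.update(altByte, array)
      return
    }

    val hist = map(altByte)
    hist(qualityIndex) = (hist(qualityIndex) + 1).toShort
    if (updateMax && qualityIndex > hist(MaxQualInd))
      hist(MaxQualInd) = qualityIndex.toShort
  }

  // Same idea as SingleLocusQualsExtension.trim: keep only bins up to the recorded maximum.
  def trim(map: SingleLocusQuals): SingleLocusQuals =
    map.map { case (k, v) => k -> v.take(v(MaxQualInd) + 1) }

  def main(args: Array[String]): Unit = {
    val quals = new SingleLocusQuals()
    addQualityForAlt(quals, 'A', 30, updateMax = true)
    addQualityForAlt(quals, 'A', 37, updateMax = true)
    addQualityForAlt(quals, 'A', 25, updateMax = false) // counted, but the max slot stays at 37
    trim(quals).foreach { case (alt, hist) =>
      // Prints the non-zero (count, qualityIndex) pairs for each alt.
      println(s"${alt.toChar}: ${hist.zipWithIndex.filter(_._1 > 0).mkString(", ")}")
    }
  }
}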