From a3d5ea1e928db815b5343660b345a061d80a2a98 Mon Sep 17 00:00:00 2001
From: David Baker Effendi <dbe@sun.ac.za>
Date: Fri, 1 Apr 2022 14:56:12 +0200
Subject: [PATCH] :bookmark: Release v1.2.1

---
 CHANGELOG.md                                  |  6 ++
 build.sbt                                     |  2 +-
 .../plume/oss/drivers/OverflowDbDriver.scala  | 79 ++++++++++---------
 .../com/github/plume/oss/DiffTests.scala      |  4 +-
 .../plume/oss/querying/DataFlowTests.scala    | 10 +--
 5 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fddcfb4e..574d895f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).
 
+## [1.2.1] - 2022-04-01
+
+### Changed
+
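+- `OverflowDbDriver::flowsBetween` performance improvement on initial cache preparation. Its `source` and `sink` parameters are now `Traversal[CfgNode]` instead of `() => Traversal[CfgNode]`.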
+
 ## [1.2.0] - 2022-03-31
 
 ### Changed
diff --git a/build.sbt b/build.sbt
index c447d2d7..671a8efc 100644
--- a/build.sbt
+++ b/build.sbt
@@ -3,7 +3,7 @@ name := "Plume"
 inThisBuild(
   List(
     organization := "com.github.plume-oss",
-    version := "1.2.0",
+    version := "1.2.1",
     scalaVersion := "2.13.8",
     crossScalaVersions := Seq("2.13.8", "3.1.1"),
     resolvers ++= Seq(
diff --git a/src/main/scala/com/github/plume/oss/drivers/OverflowDbDriver.scala b/src/main/scala/com/github/plume/oss/drivers/OverflowDbDriver.scala
index 74a6141a..cb7a023a 100644
--- a/src/main/scala/com/github/plume/oss/drivers/OverflowDbDriver.scala
+++ b/src/main/scala/com/github/plume/oss/drivers/OverflowDbDriver.scala
@@ -25,7 +25,7 @@ import java.util.concurrent.ConcurrentHashMap
 import scala.collection.mutable
 import scala.io.{BufferedSource, Source}
 import scala.jdk.CollectionConverters.{IteratorHasAsScala, MapHasAsScala}
-import scala.util.{Failure, Success, Try, Using}
+import scala.util._
 
 /** Driver to create an OverflowDB database file.
   * @param storageLocation where the database will serialize to and deserialize from.
@@ -378,42 +378,18 @@ final case class OverflowDbDriver(
     * @return the source nodes whose data flows to the given sinks uninterrupted.
     */
   def flowsBetween(
-      source: () => Traversal[CfgNode],
-      sink: () => Traversal[CfgNode],
+      source: Traversal[CfgNode],
+      sink: Traversal[CfgNode],
       sanitizers: Set[String] = Set.empty[String]
   ): List[ReachableByResult] =
     PlumeStatistics.time(
       PlumeStatistics.TIME_REACHABLE_BY_QUERYING, {
         import io.shiftleft.semanticcpg.language._
-        // Strip the cache of only nodes that will be used the most in this query to get fast starts/finishes
-        cacheConfig.dataFlowCacheFile match {
-          case Some(_) =>
-            val newCache         = new ResultTable
-            val oldCache         = resultTable.getOrElse(new ResultTable)
-            var currPathsInCache = 0
-            scala.util.Random
-              .shuffle(source().l ++ sink().l)
-              .flatMap { x =>
-                oldCache.get(x) match {
-                  case Some(paths) => Some((x, paths))
-                  case None        => None
-                }
-              }
-              .foreach { case (startOrEndNode, paths) =>
-                if (currPathsInCache + paths.size <= cacheConfig.maxCachedPaths) {
-                  currPathsInCache += paths.size
-                  newCache.add(startOrEndNode, paths)
-                }
-              }
-            oldCache.table.clear()
-            resultTable = Some(newCache)
-            setDataflowContext(context.config.maxCallDepth, context.semantics, resultTable)
-          case _ =>
-        }
 
-        val results: List[ReachableByResult] = sink()
-          .reachableByDetailed(source())(context)
+        prepareInitialTable()
+        val results: List[ReachableByResult] = sink.reachableByDetailed(source)(context)
         captureDataflowCache(results)
+
         results
           // Remove a source/sink arguments referring to itself
           .filter(x => x.path.head.node.astParent != x.path.last.node.astParent)
@@ -431,12 +407,40 @@ final case class OverflowDbDriver(
       }
     )
 
+  private def prepareInitialTable(): Unit = { // run by flowsBetween before querying: caps the cached paths
+    cacheConfig.dataFlowCacheFile match {
+      case Some(_) => // only when a data-flow cache file is configured
+        val oldCache = resultTable.getOrElse(new ResultTable) // empty table if nothing is cached yet
+        if (oldCache.table.map(_._2.size).sum <= cacheConfig.maxCachedPaths) {
+          setDataflowContext(context.config.maxCallDepth, context.semantics, Some(oldCache))
+        } else { // total cached paths exceed maxCachedPaths: keep only a subset
+          val newCache         = new ResultTable
+          var currPathsInCache = 0
+          // Keep entries in random order, stopping at the first one that would exceed maxCachedPaths.
+          Random
+            .shuffle(oldCache.table.iterator)
+            .takeWhile { case (_, paths) =>
+              currPathsInCache + paths.size <= cacheConfig.maxCachedPaths
+            }
+            .foreach { case (startOrEndNode, paths) => // NOTE(review): assumes lazy iteration — confirm
+              currPathsInCache += paths.size
+              newCache.add(startOrEndNode, paths)
+            }
+          oldCache.table.clear()
+          resultTable = Some(newCache)
+          setDataflowContext(context.config.maxCallDepth, context.semantics, resultTable)
+        }
+      case _ => // no cache file configured: nothing to prepare
+    }
+  }
+
   private def captureDataflowCache(results: List[ReachableByResult]): Unit = {
     cacheConfig.dataFlowCacheFile match {
       case Some(_) =>
         // Capture latest results
         resultTable = (results
-          .map(_.table) ++ List(resultTable).flatten).distinct
+          .map(_.table)
+          .distinct ++ List(resultTable).flatten)
           .reduceOption((a: ResultTable, b: ResultTable) => {
             b.table.foreach { case (k, v) => a.add(k, v) }
             a
@@ -467,9 +471,11 @@ final case class OverflowDbDriver(
 
             val newTab = oldTab.table
               .filter { case (k: StoredNode, _) => isNodeUnderTypes(k, unchangedTypes) }
-              .map { case (k: StoredNode, v: Vector[ReachableByResult]) =>
-                val filteredPaths = v.filterNot(isResultExpired)
-                (k, filteredPaths)
+              .flatMap { case (k: StoredNode, v: Vector[ReachableByResult]) =>
+                v.collectFirst { case v: ReachableByResult if isResultExpired(v) => v } match {
+                  case Some(_) => None // discard entry
+                  case None    => Some((k, v))
+                }
               }
               .toMap
             // Refresh old table and add new entries
@@ -482,11 +488,6 @@ final case class OverflowDbDriver(
                 s"Able to re-use ${(leftOverPSize.toDouble / startPSize) * 100.0}% of the saved paths. " +
                   s"Removed ${startPSize - leftOverPSize} expired paths from $startPSize saved paths."
               )
-            setDataflowContext(
-              context.config.maxCallDepth,
-              context.semantics,
-              Some(oldTab)
-            )
           }
         )
       case None => // Do nothing
diff --git a/src/test/scala/com/github/plume/oss/DiffTests.scala b/src/test/scala/com/github/plume/oss/DiffTests.scala
index e322cbc3..77e98bd6 100644
--- a/src/test/scala/com/github/plume/oss/DiffTests.scala
+++ b/src/test/scala/com/github/plume/oss/DiffTests.scala
@@ -102,7 +102,7 @@ class DiffTests extends AnyWordSpec with Matchers with BeforeAndAfterAll {
     val sinkNodesId1   = driver.cpg.call(Operators.addition).id.l
 
     val r1 = driver
-      .flowsBetween( () => driver.cpg.parameter("a"), () => driver.cpg.call(Operators.addition))
+      .flowsBetween(driver.cpg.parameter("a"), driver.cpg.call(Operators.addition))
       .map(_.path.map(_.node.id()))
     val cH1       = QueryEngineStatistics.results()(QueryEngineStatistics.PATH_CACHE_HITS)
     val cM1       = QueryEngineStatistics.results()(QueryEngineStatistics.PATH_CACHE_MISSES)
@@ -119,7 +119,7 @@ class DiffTests extends AnyWordSpec with Matchers with BeforeAndAfterAll {
     val sinkNodesId2   = driver.cpg.call(Operators.addition).id.l
 
     val r2 = driver
-      .flowsBetween(() => driver.cpg.parameter("a"), () => driver.cpg.call(Operators.addition))
+      .flowsBetween(driver.cpg.parameter("a"), driver.cpg.call(Operators.addition))
       .map(_.path.map(_.node.id()))
     val cH2 = QueryEngineStatistics.results()(QueryEngineStatistics.PATH_CACHE_HITS)
     val cM2 = QueryEngineStatistics.results()(QueryEngineStatistics.PATH_CACHE_MISSES)
diff --git a/src/test/scala/com/github/plume/oss/querying/DataFlowTests.scala b/src/test/scala/com/github/plume/oss/querying/DataFlowTests.scala
index 92e9e79c..43f4a312 100644
--- a/src/test/scala/com/github/plume/oss/querying/DataFlowTests.scala
+++ b/src/test/scala/com/github/plume/oss/querying/DataFlowTests.scala
@@ -47,7 +47,7 @@ class DataFlowTests extends Jimple2CpgFixture(Some(new OverflowDbDriver())) {
     val cpg = CPG(driver.cpg.graph)
 
     val r = driver
-      .flowsBetween(() => cpg.parameter("a"), () => cpg.call("<operator>.*"))
+      .flowsBetween(cpg.parameter("a"), cpg.call("<operator>.*"))
     val List(v1) = r.map(r => r.path.map(x => (x.node.method.name, x.node.code)))
 
     v1.head shouldBe ("foo", "int a")
@@ -58,7 +58,7 @@ class DataFlowTests extends Jimple2CpgFixture(Some(new OverflowDbDriver())) {
     val cpg = CPG(driver.cpg.graph)
 
     val r = driver
-      .flowsBetween(() => cpg.parameter("a"), () => cpg.call("bar"))
+      .flowsBetween(cpg.parameter("a"), cpg.call("bar"))
     val List(v1) = r.map(r => r.path.map(x => (x.node.method.name, x.node.code)))
 
     v1.head shouldBe ("foo", "int a")
@@ -69,7 +69,7 @@ class DataFlowTests extends Jimple2CpgFixture(Some(new OverflowDbDriver())) {
     val cpg = CPG(driver.cpg.graph)
 
     val r = driver
-      .flowsBetween(() => cpg.parameter("a"), () => cpg.call("println"))
+      .flowsBetween(cpg.parameter("a"), cpg.call("println"))
 
     r.map(r => r.path.map(x => (x.node.method.name, x.node.code))).foreach(println)
 
@@ -89,11 +89,11 @@ class DataFlowTests extends Jimple2CpgFixture(Some(new OverflowDbDriver())) {
     def source = cpg.call("taint").argument
     def sink   = cpg.call("baz")
 
-    val r1 = driver.flowsBetween(() => source, () => sink)
+    val r1 = driver.flowsBetween(source, sink)
     r1.map(r => r.path.map(x => (x.node.method.name, x.node.code))).foreach(println)
     r1.size shouldBe 1
 
-    val r2 = driver.flowsBetween(() => source, () => sink, Set("Foo.falseClean:int(int)"))
+    val r2 = driver.flowsBetween(source, sink, Set("Foo.falseClean:int(int)"))
     r2.size shouldBe 0
   }