mratsim · Vindaar · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024 · Oct 25, 2024
diff --git a/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim
@@ -0,0 +1,128 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  # Internals
+  constantine/threadpool,
+  constantine/named/[algebras, zoo_subgroups],
+  constantine/math/arithmetic,
+  constantine/math/ec_shortweierstrass,
+  # Helpers
+  helpers/prng_unsafe,
+  ./bench_elliptic_parallel_template
+#  ./bench_msm_impl_optional_drop_windows
+
+# ############################################################
+#
+#      Benchmark of multi-scalar multiplication (MSM)
+#     on G1 of Short Weierstrass elliptic curves in
+#   Jacobian coordinates, coefficients with leading zeros
+#
+# ############################################################
+
+type  # One result row; times are total elapsed nanoseconds: b* = reference MSM, o* = optimized MSM, *All / *Wo = useZeroWindows true / false
+  BenchTimes = tuple[numInputs: int, bits: int, bAll, bWo, oAll, oWo: float]
+
+proc msmBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int, bits: int): BenchTimes =
+  ## Times the reference and the optimized MSM on the first `numInputs` coefs/points of `ctx`, each with
+  ## `useZeroWindows` = true then false. Prints totals and ratios; returns the 4 total elapsed times in ns.
+  const bigIntBits = EC.getScalarField().bits()
+
+  template coefs: untyped = ctx.coefs.toOpenArray(0, numInputs-1)
+  template points: untyped = ctx.points.toOpenArray(0, numInputs-1)
+
+  template benchIt(body: untyped): untyped =
+    block:
+      var useZeroWindows {.inject.} = true   # injected so `body` can read it; flipped between the two runs
+      let startAll = getMonoTime()
+      block:
+        body
+      let stopAll = getMonoTime()
+      useZeroWindows = false
+      let startWo = getMonoTime()
+      block:
+        body
+      let stopWo = getMonoTime()
+      (all: float inNanoseconds(stopAll - startAll),
+       wo:  float inNanoseconds(stopWo  - startWo))
+
+  var r{.noInit.}: EC
+
+  let (bAll, bWo) = benchIt:
+    bench(&"EC multi-scalar-mul baseline  {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
+      r.multiScalarMul_reference_vartime(coefs, points, useZeroWindows)
+  let (oAll, oWo) = benchIt:
+    bench(&"EC multi-scalar-mul optimized {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
+      r.multiScalarMul_vartime(coefs, points, useZeroWindows)
+
+  let pbAll = bAll / iters.float
+  let pbWo  = bWo / iters.float
+  let poAll = oAll / iters.float
+  let poWo  = oWo / iters.float
+
+  echo &"total time baseline  (useZeroWindows = true)  = {bAll / 1e9} s"
+  echo &"total time baseline  (useZeroWindows = false) = {bWo  / 1e9} s"
+  echo &"total time optimized (useZeroWindows = true)  = {oAll / 1e9} s"
+  echo &"total time optimized (useZeroWindows = false) = {oWo  / 1e9} s"
+
+  echo &"Speedup ratio baseline with & without all windows:         {pbAll / pbWo:>6.3f}x"
+  echo &"Speedup ratio optimized with & without all windows:        {poAll / poWo:>6.3f}x"
+  echo &"Speedup ratio optimized over baseline with all windows:    {pbAll / poAll:>6.3f}x"
+  echo &"Speedup ratio optimized over baseline without all windows: {pbWo  / poWo:>6.3f}x"
+
+  result = (numInputs: numInputs, bits: bits, bAll: bAll, bWo: bWo, oAll: oAll, oWo: oWo)
+
+const Iters = 10_000  # iteration budget; main() divides it by the number of inputs (minimum 1)
+const AvailableCurves = [  # curves the benchmark is instantiated for
+  BLS12_381,
+]
+
+const testNumPoints = [2, 8, 64, 1024, 4096, 65536, 1048576] #, 4194304, 8388608, 16777216]  <- larger sizes disabled; each entry is an MSM input count
+
+template canImport(x: untyped): bool =  # true iff `import x` compiles, i.e. module `x` can be found
+  compiles:
+    import x
+
+when canImport(ggplotnim):  # plotting dependency: stop the build with a clear message when it is missing
+  import ggplotnim
+else:
+  {.error: "This benchmarks requires `ggplotnim` to produce a plot of the benchmark results.".}
+proc main() =
+  ## For every curve and every coefficient bit-width, benchmarks the MSM over all
+  ## `testNumPoints` sizes, then writes the timings to CSV and plots them with ggplotnim.
+  separator()
+  staticFor curveIdx, 0, AvailableCurves.len:
+    const curve = AvailableCurves[curveIdx]
+    const maxBits = [1, 32, 128, 512] # [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients
+    var timings = newDataFrame()
+    for nonZeroBits in maxBits:
+      var benchCtx = createBenchMsmContext(EC_ShortW_Jac[Fp[curve], G1], testNumPoints, nonZeroBits)
+      separator()
+      for inputCount in testNumPoints:
+        # Fewer repetitions for larger inputs (at least 1). NOTE(review): recorded times are batch totals, not divided by the repetition count - confirm this is intended for the plot
+        timings.add benchCtx.msmBench(inputCount, max(1, Iters div inputCount), nonZeroBits)
+        separator()
+      separator()
+      echo "\n\n\n"
+      separator()
+    separator()
+
+    # Gather the four timing columns into (`Bench`, `Time`) pairs, then scale `Time` from ns to s
+    let longForm = timings.gather(["bAll", "bWo", "oAll", "oWo"], "Bench", "Time")
+    let plotData = longForm.mutate(f{"Time" ~ `Time` * 1e-9})
+    plotData.writeCsv("/tmp/data.csv")
+    ggplot(plotData, aes("numInputs", "Time", shape = "Bench", color = "bits")) +
+      geom_point() + scale_x_continuous() +
+      scale_x_log2(breaks = @testNumPoints) + scale_y_log10() +
+      xlab("Number of inputs of the MSM") + ylab("Time [s]") +
+      ggtitle("bits = number of bits set in coefficients") +
+      margin(right = 4) + xMargin(0.05) + theme_scale(1.2) +
+      ggsave("plots/bench_result.pdf")
+
+main()   # run the benchmark sweep, CSV dump and plot defined in main()
+notes()  # NOTE(review): presumably prints the shared benchmark notes from the imported template module - confirm
diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim
@@ -29,6 +29,22 @@ import
 
 export bench_elliptic_template
 
+from std / math import divmod
+proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] =
+  ## Samples a random `BigInt[N]` whose value fits in the low `maxBit` bits
+  ## (the top one of those bits is random too, so it is not guaranteed to be set).
+  ## If `maxBit` is <= 0 or >= `N`, the coefficient will utilize all bits.
+  const WordSize = 64
+  result = rng.random_unsafe(BigInt[N])
+  if maxBit <= 0 or maxBit >= N: return # use all bits (a negative value would index limbs out of range below)
+  let numLimbs = result.limbs.len
+  # how many top limbs to zero & how many bits to drop from the next limb
+  let (d, r) = divmod(numLimbs * WordSize - maxBit, WordSize)
+  for i in countdown(numLimbs-1, numLimbs - d):
+    result.limbs[i] = SecretWord(0'u64)  # zero most significant limbs
+  if r != 0: result.shiftRight(r)        # nothing to shift when maxBit is limb-aligned; avoids relying on shiftRight(0)
+
+
 # ############################################################
 #
 #             Parallel Benchmark definitions
@@ -55,11 +71,14 @@ proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) =
 
 type BenchMsmContext*[EC] = object
   tp: Threadpool
-  numInputs: int
-  coefs: seq[getBigInt(EC.getName(), kScalarField)]
-  points: seq[affine(EC)]
-
-proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] =
+  numInputs*: int
+  coefs*: seq[getBigInt(EC.getName(), kScalarField)]
+  points*: seq[affine(EC)]
+
+proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int],
+                            maxBit = 0): BenchMsmContext[EC] =
+  ## `maxBit` sets the maximum bit set in the coefficients that are randomly sampled.
+  ## Useful to benchmark MSM with many leading zeroes.
   result.tp = Threadpool.new()
   let maxNumInputs = inputSizes.max()
 
@@ -70,7 +89,9 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
   result.points = newSeq[ECaff](maxNumInputs)
   result.coefs = newSeq[BigInt[bits]](maxNumInputs)
 
-  proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, points: ptr ECaff, coefs: ptr BigInt[bits]) {.nimcall.} =
+  proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int,
+                                         points: ptr ECaff,
+                                         coefs: ptr BigInt[bits], maxBit: int) {.nimcall.} =
     let points = cast[ptr UncheckedArray[ECaff]](points)
     let coefs = cast[ptr UncheckedArray[BigInt[bits]]](coefs)
 
@@ -82,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
       var tmp = threadRng.random_unsafe(EC)
       tmp.clearCofactor()
       points[i].affine(tmp)
-      coefs[i] = threadRng.random_unsafe(BigInt[bits])
+      coefs[i] = random_coefficient[bits](threadRng, maxBit)
 
   let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)
 
@@ -94,7 +115,10 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
 
   syncScope:
     for (id, start, size) in items(chunks):
-      result.tp.spawn genCoefPointPairsChunk[EC, ECaff](rng.next(), start, size, result.points[0].addr, result.coefs[0].addr)
+      result.tp.spawn genCoefPointPairsChunk[EC, ECaff](
+        rng.next(), start, size,
+        result.points[0].addr, result.coefs[0].addr, maxBit
+      )
 
   # Even if child threads are sleeping, it seems like perf is lower when there are threads around
   # maybe because the kernel has more overhead or time quantum to keep track off so shut them down.