Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Drop (leading) windows in MSM where all coefficients zero #479

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions benchmarks/bench_ec_msm_coeffs_with_zeroes.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# Constantine
# Copyright (c) 2018-2019 Status Research & Development GmbH
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
# Licensed and distributed under either of
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
# at your option. This file may not be copied, modified, or distributed except according to those terms.

import
# Internals
constantine/threadpool,
constantine/named/[algebras, zoo_subgroups],
constantine/math/arithmetic,
constantine/math/ec_shortweierstrass,
# Helpers
helpers/prng_unsafe,
./bench_elliptic_parallel_template
# ./bench_msm_impl_optional_drop_windows

# ############################################################
#
# Benchmark of multi-scalar multiplication (MSM) on the G1 group
# of Short Weierstrass elliptic curves in Jacobian coordinates,
# with coefficients that have leading zero bits
#
# ############################################################

type
  BenchTimes = tuple
    # One benchmark data point, as returned by `msmBench`.
    numInputs: int  # number of (coefficient, point) pairs in the MSM
    bits: int       # bit-count setting the coefficients were generated with
    # Total elapsed times in nanoseconds:
    #   bAll / bWo: reference MSM with useZeroWindows = true / false
    #   oAll / oWo: optimized MSM with useZeroWindows = true / false
    bAll, bWo, oAll, oWo: float

proc msmBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int, bits: int): BenchTimes =
  ## Benchmarks the reference and the optimized variable-time MSM on the
  ## first `numInputs` coefficient/point pairs stored in `ctx`.
  ##
  ## Each implementation is run twice: first with `useZeroWindows = true`,
  ## then with `useZeroWindows = false`.
  ##
  ## - `iters` is forwarded to `bench` and is also used to normalise the
  ##   timings before the speedup ratios are printed.
  ## - `bits` is only used for reporting (benchmark label and result tuple).
  ##
  ## Returns the four total elapsed times in nanoseconds together with
  ## `numInputs` and `bits`.
  const bigIntBits = EC.getScalarField().bits()

  template coefs: untyped = ctx.coefs.toOpenArray(0, numInputs-1)
  template points: untyped = ctx.points.toOpenArray(0, numInputs-1)

  template benchIt(body: untyped): untyped =
    ## Runs `body` twice and yields `(all, wo)`: the elapsed nanoseconds with
    ## the injected `useZeroWindows` set to `true` and to `false` respectively.
    block:
      var useZeroWindows {.inject.} = true
      let startAll = getMonoTime()
      block:
        body
      let stopAll = getMonoTime()
      useZeroWindows = false
      let startWo = getMonoTime()
      block:
        body
      let stopWo = getMonoTime()
      (all: float inNanoseconds(stopAll - startAll),
       wo: float inNanoseconds(stopWo - startWo))

  var r{.noInit.}: EC

  let (bAll, bWo) = benchIt:
    bench(&"EC multi-scalar-mul baseline {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
      r.multiScalarMul_reference_vartime(coefs, points, useZeroWindows)
  let (oAll, oWo) = benchIt:
    bench(&"EC multi-scalar-mul optimized {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters):
      r.multiScalarMul_vartime(coefs, points, useZeroWindows)

  # Per-iteration timings, used only for the ratios printed below.
  let pbAll = bAll / iters.float
  let pbWo = bWo / iters.float
  let poAll = oAll / iters.float
  let poWo = oWo / iters.float

  echo &"total time baseline (useZeroWindows = true) = {bAll / 1e9} s"
  echo &"total time baseline (useZeroWindows = false) = {bWo / 1e9} s"
  echo &"total time optimized (useZeroWindows = true) = {oAll / 1e9} s"
  echo &"total time optimized (useZeroWindows = false) = {oWo / 1e9} s"

  echo &"Speedup ratio baseline with & without all windows: {pbAll / pbWo:>6.3f}x"
  echo &"Speedup ratio optimized with & without all windows: {poAll / poWo:>6.3f}x"
  echo &"Speedup ratio optimized over baseline with all windows: {pbAll / poAll:>6.3f}x"
  echo &"Speedup ratio optimized over baseline without all windows: {pbWo / poWo:>6.3f}x"

  result = (numInputs: numInputs, bits: bits, bAll: bAll, bWo: bWo, oAll: oAll, oWo: oWo)

const
  # Iteration budget; `main` divides it by the number of inputs of each run.
  Iters = 10_000
  # Curves to benchmark.
  AvailableCurves = [BLS12_381]
  # MSM input sizes to benchmark.
  testNumPoints = [2, 8, 64, 1024, 4096, 65536, 1048576] #, 4194304, 8388608, 16777216]

template canImport(x: untyped): bool =
  ## Evaluates to `true` when `import x` compiles, i.e. when module `x`
  ## can be resolved by the compiler.
  compiles:
    import x

# The plot at the end of the benchmark needs `ggplotnim`;
# stop compilation with an explicit message when it cannot be imported.
when not canImport(ggplotnim):
  {.error: "This benchmarks requires `ggplotnim` to produce a plot of the benchmark results.".}
else:
  import ggplotnim
# Benchmark driver: for every curve in `AvailableCurves`, every entry of
# `maxBits` and every input size in `testNumPoints`, runs `msmBench` and
# collects the returned timings in a data frame. The data frame is then
# written to a CSV file and plotted to a PDF.
proc main() =
separator()
staticFor i, 0, AvailableCurves.len:
const curve = AvailableCurves[i]
const maxBits = [1, 32, 128, 512] # [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients
var df = newDataFrame()
for bits in maxBits:
# A fresh context per `bits` value: `bits` is passed as the `maxBit` argument
# that limits the coefficients generated for this context.
var ctx = createBenchMsmContext(EC_ShortW_Jac[Fp[curve], G1], testNumPoints, bits)
separator()
for numPoints in testNumPoints:
# Scale the iteration count down as the input grows, but run at least once.
let batchIters = max(1, Iters div numPoints)
df.add ctx.msmBench(numPoints, batchIters, bits)
separator()
separator()
echo "\n\n\n"
separator()
separator()

# NOTE(review): presumably `gather` reshapes the four timing columns into
# long format (`Bench` = column name, `Time` = value) - confirm.
# `msmBench` reports nanoseconds; the factor 1e-9 converts them to seconds.
df = df.gather(["bAll", "bWo", "oAll", "oWo"], "Bench", "Time")
.mutate(f{"Time" ~ `Time` * 1e-9})
# Raw results are written to a fixed path.
df.writeCsv("/tmp/data.csv")
ggplot(df, aes("numInputs", "Time", shape = "Bench", color = "bits")) +
geom_point() +
scale_x_continuous() +
scale_x_log2(breaks = @testNumPoints) + scale_y_log10() +
xlab("Number of inputs of the MSM") + ylab("Time [s]") +
ggtitle("bits = number of bits set in coefficients") +
margin(right = 4) +
xMargin(0.05) +
theme_scale(1.2) +
ggsave("plots/bench_result.pdf")

# Run the benchmarks, then call `notes()` (not defined in the visible part of this file).
main()
notes()
40 changes: 32 additions & 8 deletions benchmarks/bench_elliptic_parallel_template.nim
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,22 @@ import

export bench_elliptic_template

from std / math import divmod
proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] =
  ## Samples a random `BigInt[N]` in which at most the `maxBit` least
  ## significant bits can be set; all higher bits are cleared.
  ## `maxBit` is an upper bound: bit `maxBit - 1` itself is random and
  ## is not guaranteed to be set.
  ##
  ## If `maxBit` is zero (the default), negative, or at least `N`,
  ## the coefficient is returned as sampled, utilizing all bits.
  result = rng.random_unsafe(BigInt[N])
  if maxBit <= 0 or maxBit >= N: return # use all bits

  # Derive the limb width from the word type instead of assuming 64 bits.
  const WordSize = sizeof(SecretWord) * 8
  let numLimbs = result.limbs.len
  # Number of top bits (counted over the full limb storage) to clear.
  # Since 0 < maxBit < N <= numLimbs * WordSize, 0 < toShift < numLimbs * WordSize.
  let toShift = numLimbs * WordSize - maxBit
  let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb
  for i in countdown(numLimbs - 1, numLimbs - d):
    result.limbs[i] = SecretWord(0) # zero most significant limbs
  # Nothing is left to clear when the cut falls on a limb boundary.
  # NOTE(review): also avoids calling `shiftRight` with 0, which may involve a
  # complementary shift by the full word width - confirm against its implementation.
  if r > 0:
    result.shiftRight(r) # shift right by remaining required


# ############################################################
#
# Parallel Benchmark definitions
Expand All @@ -55,11 +71,14 @@ proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) =

type BenchMsmContext*[EC] = object
tp: Threadpool
numInputs: int
coefs: seq[getBigInt(EC.getName(), kScalarField)]
points: seq[affine(EC)]

proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] =
numInputs*: int
coefs*: seq[getBigInt(EC.getName(), kScalarField)]
points*: seq[affine(EC)]

proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int],
maxBit = 0): BenchMsmContext[EC] =
## `maxBit` sets the maximum bit set in the coefficients that are randomly sampled.
## Useful to benchmark MSM with many leading zeroes.
result.tp = Threadpool.new()
let maxNumInputs = inputSizes.max()

Expand All @@ -70,7 +89,9 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
result.points = newSeq[ECaff](maxNumInputs)
result.coefs = newSeq[BigInt[bits]](maxNumInputs)

proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, points: ptr ECaff, coefs: ptr BigInt[bits]) {.nimcall.} =
proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int,
points: ptr ECaff,
coefs: ptr BigInt[bits], maxBit: int) {.nimcall.} =
let points = cast[ptr UncheckedArray[ECaff]](points)
let coefs = cast[ptr UncheckedArray[BigInt[bits]]](coefs)

Expand All @@ -82,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC
var tmp = threadRng.random_unsafe(EC)
tmp.clearCofactor()
points[i].affine(tmp)
coefs[i] = threadRng.random_unsafe(BigInt[bits])
coefs[i] = random_coefficient[bits](threadRng, maxBit)

let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads)

Expand All @@ -94,7 +115,10 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC

syncScope:
for (id, start, size) in items(chunks):
result.tp.spawn genCoefPointPairsChunk[EC, ECaff](rng.next(), start, size, result.points[0].addr, result.coefs[0].addr)
result.tp.spawn genCoefPointPairsChunk[EC, ECaff](
rng.next(), start, size,
result.points[0].addr, result.coefs[0].addr, maxBit
)

# Even if child threads are sleeping, it seems like perf is lower when there are threads around
# maybe because the kernel has more overhead or time quantum to keep track of so shut them down.
Expand Down
Loading
Loading