From 121a093a773850e028ea4a4e6411aff6e814ea05 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 25 Oct 2024 13:43:25 +0200 Subject: [PATCH 1/4] [MSM] implement skipping windows where all coefficients are zero For scalar field elements modulo p, if all coefficients `a_i` are smaller than some value x < p, their binary representations will have leading zero bits. In this case, we can skip processing the windows corresponding to these leading zeros in the bucket calculation since they would not contribute to the final result. This follows the same idea as in the implementation: https://github.com/privacy-scaling-explorations/halo2curves/pull/168 --- .../math/elliptic/ec_multi_scalar_mul.nim | 245 ++++++++++++------ 1 file changed, 159 insertions(+), 86 deletions(-) diff --git a/constantine/math/elliptic/ec_multi_scalar_mul.nim b/constantine/math/elliptic/ec_multi_scalar_mul.nim index 25c1fc75..bf40584e 100644 --- a/constantine/math/elliptic/ec_multi_scalar_mul.nim +++ b/constantine/math/elliptic/ec_multi_scalar_mul.nim @@ -37,17 +37,51 @@ export bestBucketBitSize, abstractions # # See the litterature references at the top of `ec_multi_scalar_mul_scheduler.nim` +proc determineBitsSet*[bits: static int](coefs: ptr UncheckedArray[BigInt[bits]], N: int): int = + ## Computes the highest bit set in the collection of all MSM coefficients. + ## Useful to avoid bucket computation in all most-significant-windows that + ## are fully zero in all coefficients. + const WordSize = 64 + var acc: BigInt[bits] + let numLimbs = acc.limbs.len + # OR all coefficients + for i in 0 ..< N: + let coef = coefs[i] + for j in 0 ..< numLimbs: + acc.limbs[j] = acc.limbs[j] or coef.limbs[j] + + # Determine most significant set bit + 1 (only first byte set -> msb == 8) + var msb = -1 + block MsbCheck: + + for i in countdown(numLimbs-1, 0): + for j in countdown(WordSize - 1, 0): # Check each bit + let val = (1'u64 shl j) + if (acc.limbs[i].uint64 and val) != 0: + msb = i * WordSize + j + 1 + break MsbCheck + result = msb + func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, HeapAlloc].} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Inner implementation of MSM, for static dispatch over c, the bucket bit length ## This is a straightforward simple translation of BDLO12, section 4 - # Prologue # -------- + let numWindows = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + msb = ceilDiv_vartime(msb, c) # * c + msb + else: + bits.ceilDiv_vartime(c) const numBuckets = 1 shl c - 1 # bucket 0 is unused - const numWindows = bits.ceilDiv_vartime(c) let miniMSMs = allocHeapArray(EC, numWindows) let buckets = allocHeapArray(EC, numBuckets) @@ -88,7 +122,6 @@ func multiScalarMulImpl_reference_vartime[bits: static int, EC, ECaff]( for _ in 0 ..< c: r.double() r ~+= miniMSMs[w] - # Cleanup # ------- buckets.freeHeap() @@ -98,28 +131,29 @@ func multiScalarMul_reference_dispatch_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int) {.tags:[VarTime, HeapAlloc].} = + N: int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ let c = bestBucketBitSize(N, bits, useSignedBuckets = false, useManualTuning = false) case c - of 2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 2) - of 3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 3) - of 4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 4) - of 5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 5) - of 6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 6) - of 7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 7) - of 8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 8) - of 9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 9) - of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10) - of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11) - of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12) - of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13) - of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14) - of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15) - - of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16) + of 2: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 2, useZeroWindows) + of 3: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 3, useZeroWindows) + of 4: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 4, useZeroWindows) + of 5: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 5, useZeroWindows) + of 6: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 6, useZeroWindows) + of 7: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 7, useZeroWindows) + of 8: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 8, useZeroWindows) + of 9: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 9, useZeroWindows) + of 10: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 10, useZeroWindows) + of 11: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 11, useZeroWindows) + of 12: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 12, useZeroWindows) + of 13: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 13, useZeroWindows) + of 14: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..20: multiScalarMulImpl_reference_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() @@ -127,41 +161,45 @@ func multiScalarMul_reference_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int) {.tags:[VarTime, HeapAlloc].} = + N: int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ - multiScalarMul_reference_dispatch_vartime(r, coefs, points, N) + multiScalarMul_reference_dispatch_vartime(r, coefs, points, N, useZeroWindows) -func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff]) {.tags:[VarTime, HeapAlloc].} = +func multiScalarMul_reference_vartime*[EC, ECaff](r: var EC, coefs: openArray[BigInt], points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_reference_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) func multiScalarMul_reference_vartime*[F, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[F], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ let n = cast[int](len) let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64) coefs_big.batchFromField(coefs, n) - r.multiScalarMul_reference_vartime(coefs_big, points, n) + r.multiScalarMul_reference_vartime(coefs_big, points, n, useZeroWindows) freeHeapAligned(coefs_big) func multiScalarMul_reference_vartime*[EC, ECaff]( r: var EC, coefs: openArray[Fr], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc].} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_reference_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) # ########################################################### # # # @@ -256,27 +294,41 @@ func miniMSM[bits: static int, EC, ECaff]( func msmImpl_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, HeapAlloc], meter.} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ # Setup # ----- const numBuckets = 1 shl (c-1) - let buckets = allocHeapArray(EC, numBuckets) for i in 0 ..< numBuckets: buckets[i].setNeutral() # Algorithm # --------- - const excess = bits mod c - const top = bits - excess + var excess = bits mod c + let top = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + if msb < bits - excess: + msb = ceilDiv_vartime(msb, c) * c + excess = 0 + msb + else: # in this case, we don't gain anything by modiying windows + bits - excess + else: + bits - excess + var w = top r.setNeutral() - when top != 0: # Prologue - when excess != 0: + if top != 0: # Prologue + if excess != 0: r.miniMSM(buckets, w, kTopWindow, c, coefs, points, N) w -= c else: @@ -350,10 +402,10 @@ func miniMSM_affine[NumBuckets, QueueLen, EC, ECaff; bits: static int]( func msmAffineImpl_vartime[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + N: int, c: static int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ - # Setup # ----- const (numBuckets, queueLen) = c.deriveSchedulerConstants() @@ -363,13 +415,27 @@ func msmAffineImpl_vartime[bits: static int, EC, ECaff]( # Algorithm # --------- - const excess = bits mod c - const top = bits - excess + var excess = bits mod c + let top = + if not useZeroWindows: + var msb = determineBitsSet[bits](coefs, N) + if msb < 0: # return early, nothing to do, all empty + return + # round up to next window of c + if msb < bits - excess: + msb = ceilDiv_vartime(msb, c) * c + excess = 0 + msb + else: # in this case, we don't gain anything by modiying windows + bits - excess + else: + bits - excess + var w = top r.setNeutral() - when top != 0: # Prologue - when excess != 0: + if top != 0: # Prologue + if excess != 0: # The top might use only a few bits, the affine scheduler would likely have significant collisions for i in 0 ..< numBuckets: sched.buckets.pt[i].setNeutral() @@ -436,7 +502,8 @@ template withEndo[coefsBits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[coefsBits]], points: ptr UncheckedArray[ECaff], - N: int, c: static int) = + N: int, c: static int, + useZeroWindows: bool) = when hasEndomorphismAcceleration(EC.getName()) and EndomorphismThreshold <= coefsBits and coefsBits <= EC.getScalarField().bits() and @@ -447,7 +514,7 @@ template withEndo[coefsBits: static int, EC, ECaff]( let (endoCoefs, endoPoints, endoN) = applyEndomorphism(coefs, points, N) # Given that bits and N changed, we are able to use a bigger `c` # but it has no significant impact on performance - msmProc(r, endoCoefs, endoPoints, endoN, c) + msmProc(r, endoCoefs, endoPoints, endoN, c, useZeroWindows) freeHeap(endoCoefs) freeHeap(endoPoints) else: @@ -459,7 +526,8 @@ template withEndo[coefsBits: static int, EC, ECaff]( func msm_dispatch_vartime[bits: static int, F, G]( r: var (EC_ShortW_Jac[F, G] or EC_ShortW_Prj[F, G]), coefs: ptr UncheckedArray[BigInt[bits]], - points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int) = + points: ptr UncheckedArray[EC_ShortW_Aff[F, G]], N: int, + useZeroWindows: bool) = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ let c = bestBucketBitSize(N, bits, useSignedBuckets = true, useManualTuning = true) @@ -469,29 +537,30 @@ func msm_dispatch_vartime[bits: static int, F, G]( # but it has no significant impact on performance case c - of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2) - of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3) - of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4) - of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5) - of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6) - of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7) - of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8) - - of 9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 9) - of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10) - of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11) - of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12) - of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13) - of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14) - of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15) - - of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16) + of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2, useZeroWindows) + of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3, useZeroWindows) + of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4, useZeroWindows) + of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5, useZeroWindows) + of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6, useZeroWindows) + of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7, useZeroWindows) + of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8, useZeroWindows) + + of 9: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 9, useZeroWindows) + of 10: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows) + of 11: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows) + of 12: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows) + of 13: withEndo(msmAffineImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows) + of 14: msmAffineImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: msmAffineImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..17: msmAffineImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() func msm_dispatch_vartime[bits: static int, F]( r: var EC_TwEdw_Prj[F], coefs: ptr UncheckedArray[BigInt[bits]], - points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int) = + points: ptr UncheckedArray[EC_TwEdw_Aff[F]], N: int, + useZeroWindows: bool) = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ]Pₙ @@ -503,22 +572,22 @@ func msm_dispatch_vartime[bits: static int, F]( # but it has no significant impact on performance case c - of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2) - of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3) - of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4) - of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5) - of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6) - of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7) - of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8) - of 9: withEndo(msmImpl_vartime, r, coefs, points, N, c = 9) - of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10) - of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11) - of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12) - of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13) - of 14: msmImpl_vartime(r, coefs, points, N, c = 14) - of 15: msmImpl_vartime(r, coefs, points, N, c = 15) - - of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16) + of 2: withEndo(msmImpl_vartime, r, coefs, points, N, c = 2, useZeroWindows) + of 3: withEndo(msmImpl_vartime, r, coefs, points, N, c = 3, useZeroWindows) + of 4: withEndo(msmImpl_vartime, r, coefs, points, N, c = 4, useZeroWindows) + of 5: withEndo(msmImpl_vartime, r, coefs, points, N, c = 5, useZeroWindows) + of 6: withEndo(msmImpl_vartime, r, coefs, points, N, c = 6, useZeroWindows) + of 7: withEndo(msmImpl_vartime, r, coefs, points, N, c = 7, useZeroWindows) + of 8: withEndo(msmImpl_vartime, r, coefs, points, N, c = 8, useZeroWindows) + of 9: withEndo(msmImpl_vartime, r, coefs, points, N, c = 9, useZeroWindows) + of 10: withEndo(msmImpl_vartime, r, coefs, points, N, c = 10, useZeroWindows) + of 11: withEndo(msmImpl_vartime, r, coefs, points, N, c = 11, useZeroWindows) + of 12: withEndo(msmImpl_vartime, r, coefs, points, N, c = 12, useZeroWindows) + of 13: withEndo(msmImpl_vartime, r, coefs, points, N, c = 13, useZeroWindows) + of 14: msmImpl_vartime(r, coefs, points, N, c = 14, useZeroWindows) + of 15: msmImpl_vartime(r, coefs, points, N, c = 15, useZeroWindows) + + of 16..17: msmImpl_vartime(r, coefs, points, N, c = 16, useZeroWindows) else: unreachable() @@ -526,43 +595,47 @@ func multiScalarMul_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[BigInt[bits]], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ - msm_dispatch_vartime(r, coefs, points, len) + msm_dispatch_vartime(r, coefs, points, len, useZeroWindows) func multiScalarMul_vartime*[bits: static int, EC, ECaff]( r: var EC, coefs: openArray[BigInt[bits]], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + msm_dispatch_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) func multiScalarMul_vartime*[F, EC, ECaff]( r: var EC, coefs: ptr UncheckedArray[F], points: ptr UncheckedArray[ECaff], - len: int) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = + len: int, + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ let n = cast[int](len) let coefs_big = allocHeapArrayAligned(F.getBigInt(), n, alignment = 64) coefs_big.batchFromField(coefs, n) - r.multiScalarMul_vartime(coefs_big, points, n) + r.multiScalarMul_vartime(coefs_big, points, n, useZeroWindows) freeHeapAligned(coefs_big) func multiScalarMul_vartime*[EC, ECaff]( r: var EC, coefs: openArray[Fr], - points: openArray[ECaff]) {.tags:[VarTime, Alloca, HeapAlloc], inline.} = + points: openArray[ECaff], + useZeroWindows: bool) {.tags:[VarTime, Alloca, HeapAlloc], meter.} = ## Multiscalar multiplication: ## r <- [a₀]P₀ + [a₁]P₁ + ... + [aₙ₋₁]Pₙ₋₁ debug: doAssert coefs.len == points.len let N = points.len - multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N) + multiScalarMul_vartime(r, coefs.asUnchecked(), points.asUnchecked(), N, useZeroWindows) From d5c9a30f3df66507431ecd6ee485ff2b93d83357 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 25 Oct 2024 13:53:45 +0200 Subject: [PATCH 2/4] [tests] add proc to sample random numbers less than X bits in size --- benchmarks/bench_elliptic_parallel_template.nim | 16 ++++++++++++++++ tests/math_elliptic_curves/t_ec_template.nim | 15 +++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim index 647a9f81..41d6cbbe 100644 --- a/benchmarks/bench_elliptic_parallel_template.nim +++ b/benchmarks/bench_elliptic_parallel_template.nim @@ -29,6 +29,22 @@ import export bench_elliptic_template +from std / math import divmod +proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] = + ## Initializes a random BigInt[N] with `maxBit` as the most significant bit + ## of it. + ## If `maxBit` is set to zero, the coefficient will utilize all bits. + const WordSize = 64 + let toShift = result.limbs.len * WordSize - maxBit + let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb + result = rng.random_unsafe(BigInt[N]) + if maxBit == 0 or maxBit >= N: return # use all bits + let limbs = result.limbs.len + for i in countdown(limbs-1, limbs - d): + result.limbs[i] = SecretWord(0'u64) # zero most significant limbs + result.shiftRight(r) # shift right by remaining required + + # ############################################################ # # Parallel Benchmark definitions diff --git a/tests/math_elliptic_curves/t_ec_template.nim b/tests/math_elliptic_curves/t_ec_template.nim index 2e003f3b..a646616b 100644 --- a/tests/math_elliptic_curves/t_ec_template.nim +++ b/tests/math_elliptic_curves/t_ec_template.nim @@ -94,6 +94,21 @@ func random_point*(rng: var RngState, EC: typedesc, randZ: bool, gen: RandomGen) else: result = rng.random_long01Seq_with_randZ(EC) +from std / math import divmod +proc random_coefficient*[N: static int](rng: var RngState, maxBit: int = 0): BigInt[N] = + ## Initializes a random BigInt[N] with `maxBit` as the most significant bit + ## of it. + ## If `maxBit` is set to zero, the coefficient will utilize all bits. + const WordSize = 64 + let toShift = result.limbs.len * WordSize - maxBit + let (d, r) = divmod(toShift, WordSize) # how many limbs to zero & how many bits in next limb + result = rng.random_unsafe(BigInt[N]) + if maxBit == 0 or maxBit >= N: return # use all bits + let limbs = result.limbs.len + for i in countdown(limbs-1, limbs - d): + result.limbs[i] = SecretWord(0'u64) # zero most significant limbs + result.shiftRight(r) # shift right by remaining required + proc run_EC_addition_tests*( ec: typedesc, Iters: static int, From 004d6c5b44559822eccb67ffe6ab1eec2ff65845 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 25 Oct 2024 14:00:43 +0200 Subject: [PATCH 3/4] [bench] add benchmark comparing zero window optimization --- .../bench_ec_msm_coeffs_with_zeroes.nim | 128 ++++++++++++++++++ .../bench_elliptic_parallel_template.nim | 24 ++-- 2 files changed, 144 insertions(+), 8 deletions(-) create mode 100644 benchmarks/bench_ec_msm_coeffs_with_zeroes.nim diff --git a/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim new file mode 100644 index 00000000..e3c160fa --- /dev/null +++ b/benchmarks/bench_ec_msm_coeffs_with_zeroes.nim @@ -0,0 +1,128 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/threadpool, + constantine/named/[algebras, zoo_subgroups], + constantine/math/arithmetic, + constantine/math/ec_shortweierstrass, + # Helpers + helpers/prng_unsafe, + ./bench_elliptic_parallel_template +# ./bench_msm_impl_optional_drop_windows + +# ############################################################ +# +# Benchmark of the G1 group of +# Short Weierstrass elliptic curves +# in (homogeneous) projective coordinates +# +# ############################################################ + +type + BenchTimes = tuple[numInputs: int, bits: int, bAll, bWo, oAll, oWo: float] + +proc msmBench*[EC](ctx: var BenchMsmContext[EC], numInputs: int, iters: int, bits: int): BenchTimes = + const bigIntBits = EC.getScalarField().bits() + type ECaff = affine(EC) + + template coefs: untyped = ctx.coefs.toOpenArray(0, numInputs-1) + template points: untyped = ctx.points.toOpenArray(0, numInputs-1) + + template benchIt(body: untyped): untyped = + block: + var useZeroWindows {.inject.} = true + let startAll = getMonotime() + block: + body + let stopAll = getMonoTime() + useZeroWindows = false + let startWo = getMonoTime() + block: + body + let stopWo = getMonotime() + (all: float inNanoseconds(stopAll - startAll), + wo: float inNanoseconds(stopWo - startWo)) + + var r{.noInit.}: EC + var startNaive, stopNaive, startbaseline, stopbaseline, startopt, stopopt, startpara, stoppara: MonoTime + + let (bAll, bWo) = benchIt: + bench(&"EC multi-scalar-mul baseline {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters): + r.multiScalarMul_reference_vartime(coefs, points, useZeroWindows) + let (oAll, oWo) = benchIt: + bench(&"EC multi-scalar-mul optimized {align($numInputs, 10)} ({bigIntBits}-bit coefs, points), nonZeroBits = {bits}, useZeroWindows = {useZeroWindows}", EC, iters): + r.multiScalarMul_vartime(coefs, points, useZeroWindows) + + let pbAll = bAll / iters.float + let pbWo = bWo / iters.float + let poAll = oAll / iters.float + let poWo = oWo / iters.float + + echo &"total time baseline (useZeroWindows = true) = {bAll / 1e9} s" + echo &"total time baseline (useZeroWindows = false) = {bWo / 1e9} s" + echo &"total time optimized (useZeroWindows = true) = {oAll / 1e9} s" + echo &"total time optimized (useZeroWindows = false) = {oWo / 1e9} s" + + echo &"Speedup ratio baseline with & without all windows: {pbAll / pbWo:>6.3f}x" + echo &"Speedup ratio optimized with & without all windows: {poAll / poWo:>6.3f}x" + echo &"Speedup ratio optimized over baseline with all windows: {pbAll / poAll:>6.3f}x" + echo &"Speedup ratio optimized over baseline without all windows: {pbWo / poWo:>6.3f}x" + + result = (numInputs: numInputs, bits: bits, bAll: bAll, bWo: bWo, oAll: oAll, oWo: oWo) + +const Iters = 10_000 +const AvailableCurves = [ + BLS12_381, +] + +const testNumPoints = [2, 8, 64, 1024, 4096, 65536, 1048576] #, 4194304, 8388608, 16777216] + +template canImport(x: untyped): bool = + compiles: + import x + +when canImport(ggplotnim): + import ggplotnim +else: + {.error: "This benchmarks requires `ggplotnim` to produce a plot of the benchmark results.".} +proc main() = + separator() + staticFor i, 0, AvailableCurves.len: + const curve = AvailableCurves[i] + const maxBits = [1, 32, 128, 512] # [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + var df = newDataFrame() + for bits in maxBits: + var ctx = createBenchMsmContext(EC_ShortW_Jac[Fp[curve], G1], testNumPoints, bits) + separator() + for numPoints in testNumPoints: + let batchIters = max(1, Iters div numPoints) + df.add ctx.msmBench(numPoints, batchIters, bits) + separator() + separator() + echo "\n\n\n" + separator() + separator() + + df = df.gather(["bAll", "bWo", "oAll", "oWo"], "Bench", "Time") + .mutate(f{"Time" ~ `Time` * 1e-9}) + df.writeCsv("/tmp/data.csv") + ggplot(df, aes("numInputs", "Time", shape = "Bench", color = "bits")) + + geom_point() + + scale_x_continuous() + + scale_x_log2(breaks = @testNumPoints) + scale_y_log10() + + xlab("Number of inputs of the MSM") + ylab("Time [s]") + + ggtitle("bits = number of bits set in coefficients") + + margin(right = 4) + + xMargin(0.05) + + theme_scale(1.2) + + ggsave("plots/bench_result.pdf") + +main() +notes() diff --git a/benchmarks/bench_elliptic_parallel_template.nim b/benchmarks/bench_elliptic_parallel_template.nim index 41d6cbbe..19ccac6d 100644 --- a/benchmarks/bench_elliptic_parallel_template.nim +++ b/benchmarks/bench_elliptic_parallel_template.nim @@ -71,11 +71,14 @@ proc multiAddParallelBench*(EC: typedesc, numInputs: int, iters: int) = type BenchMsmContext*[EC] = object tp: Threadpool - numInputs: int - coefs: seq[getBigInt(EC.getName(), kScalarField)] - points: seq[affine(EC)] - -proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmContext[EC] = + numInputs*: int + coefs*: seq[getBigInt(EC.getName(), kScalarField)] + points*: seq[affine(EC)] + +proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int], + maxBit = 0): BenchMsmContext[EC] = + ## `maxBit` sets the maximum bit set in the coefficients that are randomly sampled. + ## Useful to benchmark MSM with many leading zeroes. result.tp = Threadpool.new() let maxNumInputs = inputSizes.max() @@ -86,7 +89,9 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC result.points = newSeq[ECaff](maxNumInputs) result.coefs = newSeq[BigInt[bits]](maxNumInputs) - proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, points: ptr ECaff, coefs: ptr BigInt[bits]) {.nimcall.} = + proc genCoefPointPairsChunk[EC, ECaff](rngSeed: uint64, start, len: int, + points: ptr ECaff, + coefs: ptr BigInt[bits], maxBit: int) {.nimcall.} = let points = cast[ptr UncheckedArray[ECaff]](points) let coefs = cast[ptr UncheckedArray[BigInt[bits]]](coefs) @@ -98,7 +103,7 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC var tmp = threadRng.random_unsafe(EC) tmp.clearCofactor() points[i].affine(tmp) - coefs[i] = threadRng.random_unsafe(BigInt[bits]) + coefs[i] = random_coefficient[bits](threadRng, maxBit) let chunks = balancedChunksPrioNumber(0, maxNumInputs, result.tp.numThreads) @@ -110,7 +115,10 @@ proc createBenchMsmContext*(EC: typedesc, inputSizes: openArray[int]): BenchMsmC syncScope: for (id, start, size) in items(chunks): - result.tp.spawn genCoefPointPairsChunk[EC, ECaff](rng.next(), start, size, result.points[0].addr, result.coefs[0].addr) + result.tp.spawn genCoefPointPairsChunk[EC, ECaff]( + rng.next(), start, size, + result.points[0].addr, result.coefs[0].addr, maxBit + ) # Even if child threads are sleeping, it seems like perf is lower when there are threads around # maybe because the kernel has more overhead or time quantum to keep track off so shut them down. From 24899126b192e527d01418afe993969ca6675704 Mon Sep 17 00:00:00 2001 From: Vindaar Date: Fri, 25 Oct 2024 14:01:48 +0200 Subject: [PATCH 4/4] [tests] add test cases for zero window MSM operations --- .../t_ec_msm_zero_windows_sanity.nim | 25 +++++ ...rtw_jac_g1_msm_zero_windows_comparison.nim | 29 +++++ tests/math_elliptic_curves/t_ec_template.nim | 102 +++++++++++++++++- 3 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim create mode 100644 tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim diff --git a/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim new file mode 100644 index 00000000..08c4a216 --- /dev/null +++ b/tests/math_elliptic_curves/t_ec_msm_zero_windows_sanity.nim @@ -0,0 +1,25 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/named/algebras, + constantine/math/ec_shortweierstrass, + constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +run_EC_multi_scalar_mul_zero_windows_sanity( + ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1], + moduleName = "test_msm_zero_windows_sanity" & $BN254_Snarks + ) + +run_EC_multi_scalar_mul_zero_windows_sanity( + ec = EC_ShortW_Jac[Fp[BLS12_381], G1], + moduleName = "test_msm_zero_windows_sanity" & $BLS12_381 + ) diff --git a/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim new file mode 100644 index 00000000..c7dfadd7 --- /dev/null +++ b/tests/math_elliptic_curves/t_ec_shortw_jac_g1_msm_zero_windows_comparison.nim @@ -0,0 +1,29 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + # Internals + constantine/named/algebras, + constantine/math/ec_shortweierstrass, + constantine/math/arithmetic, + # Test utilities + ./t_ec_template + +const numPoints = [1, 2, 8, 16, 32, 64, 128, 1024, 2048, 4096] # 32768, 262144, 1048576] + +run_EC_multi_scalar_mul_different_zero_windows( + ec = EC_ShortW_Jac[Fp[BN254_Snarks], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BN254_Snarks + ) + +run_EC_multi_scalar_mul_different_zero_windows( + ec = EC_ShortW_Jac[Fp[BLS12_381], G1], + numPoints = numPoints, + moduleName = "test_ec_shortweierstrass_jacobian_multi_scalar_mul_different_zero_windows_" & $BLS12_381 + ) diff --git a/tests/math_elliptic_curves/t_ec_template.nim b/tests/math_elliptic_curves/t_ec_template.nim index a646616b..cbafdd4a 100644 --- a/tests/math_elliptic_curves/t_ec_template.nim +++ b/tests/math_elliptic_curves/t_ec_template.nim @@ -14,7 +14,7 @@ import # Standard library - std/[unittest, times], + std/[unittest, times, strformat], # Internals constantine/platforms/abstractions, constantine/named/algebras, @@ -1405,3 +1405,103 @@ proc run_EC_multi_scalar_mul_impl*[N: static int]( test(ec, gen = Uniform) test(ec, gen = HighHammingWeight) test(ec, gen = Long01Sequence) + +proc run_EC_multi_scalar_mul_zero_windows_sanity*( + ec: typedesc, + moduleName: string) = + # Random seed for reproducibility + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo moduleName, " xoshiro512** seed: ", seed + + const testSuiteDesc = "Elliptic curve multi-scalar-multiplication" + const NumPoints = 4096 + + suite "Scalar coefficients with specific number of zero bits": + const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + for bits in maxBits: + test "Zero bits " & $bits: + proc test(EC: typedesc) = + type T = BigInt[EC.getScalarField().bits()] + var coefs = newSeq[T](NumPoints) + for i in 0 ..< NumPoints: + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + # verify number of bits needed + let msb = determineBitsSet(cast[ptr UncheckedArray[T]](coefs[0].addr), NumPoints) + ## `determineBitsSet` returns the highest set bit in the list of coefficients + 1 + ## and the size of the BigInt if the target size is larger than the BigInt. + check msb == min(bits, bEC.getScalarField().bits()) + test(ec) + +import ../../benchmarks/bench_msm_impl_optional_drop_windows +proc run_EC_multi_scalar_mul_different_zero_windows*[N: static int]( + ec: typedesc, + numPoints: array[N, int], + moduleName: string) = + # Random seed for reproducibility + var rng: RngState + let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32 + rng.seed(seed) + echo "\n------------------------------------------------------\n" + echo moduleName, " xoshiro512** seed: ", seed + + const testSuiteDesc = "Elliptic curve multi-scalar-multiplication with all windows compared to dropping MSB zero windows" + + suite &"{testSuiteDesc} - {$ec} - [{WordBitWidth}-bit mode]": + const maxBits = [1, 8, 16, 32, 64, 128, 256, 512] # how many bits are set in the coefficients + # first check a selection of bits with different number of points + for bits in maxBits: + for n in numPoints: + let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false) + test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})": + proc test(EC: typedesc) = + var points = newSeq[affine(EC)](n) + var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n) + + for i in 0 ..< n: + var tmp = rng.random_unsafe(EC) + tmp.clearCofactor() + points[i].affine(tmp) + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + + var refAll, refWo, optAll, optWo: EC + refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true) + refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false) + optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true) + optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false) + + doAssert bool(refAll == refWo) + doAssert bool(optAll == optWo) + doAssert bool(refWo == optWo) + + test(ec) + + # now check each possible bit explicitly + # Note: this uses a fixed bucket size, but given the above, that should be fine + for bits in 0 ..< ec.getScalarField().bits(): + const n = 32 + let bucketBits = bestBucketBitSize(n, ec.getScalarField().bits(), useSignedBuckets = false, useManualTuning = false) + test &"{$ec} Multi-scalar-mul (N={n}, bucket bits: {bucketBits}, maxBitsSet={bits})": + proc test(EC: typedesc) = + var points = newSeq[affine(EC)](n) + var coefs = newSeq[BigInt[EC.getScalarField().bits()]](n) + + for i in 0 ..< n: + var tmp = rng.random_unsafe(EC) + tmp.clearCofactor() + points[i].affine(tmp) + coefs[i] = random_coefficient[EC.getScalarField().bits()](rng, bits) + + var refAll, refWo, optAll, optWo: EC + refAll.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = true) + refWo.multiScalarMul_reference_vartime(coefs, points, useZeroWindows = false) + optAll.multiScalarMul_vartime(coefs, points, useZeroWindows = true) + optWo.multiScalarMul_vartime(coefs, points, useZeroWindows = false) + + doAssert bool(refAll == refWo) + doAssert bool(optAll == optWo) + doAssert bool(refWo == optWo) + + test(ec)