ec-nvidia: switch to 32-bit due to upstream fused-multiply-add-with-c…

…arry 64-bit bug
mratsim · Oct 27, 2024 · bea5c1a · bea5c1a
1 parent 1529eb6
commit bea5c1a
Show file tree

Hide file tree

Showing 13 changed files with 46 additions and 505 deletions.
diff --git a/constantine.nimble b/constantine.nimble
@@ -622,7 +622,6 @@ const testDescNvidia: seq[string] = @[
   "tests/gpu/t_nsqr.nim",
   "tests/gpu/t_nsqr_rt.nim",
   "tests/gpu/t_ec_jac_coords.nim",
-  "tests/gpu/t_ec_sum_port.nim",
   "tests/gpu/t_ec_sum.nim"
 ]
 

diff --git a/constantine/math_compiler/impl_curves_ops_affine.nim b/constantine/math_compiler/impl_curves_ops_affine.nim
@@ -90,7 +90,7 @@ proc isNeutralAff*(asy: Assembler_LLVM, cd: CurveDescriptor, r, a: ValueRef) {.u
 
     let (ri, ai) = llvmParams
 
-    let P = asy.asEcPointAff(ed, ai)
+    let P = asy.asEcPointAff(cd, ai)
 
     asy.store(ri, P.x.isZero() and P.y.isZero())
 

diff --git a/docs/crypto-nvidia_gpus.md b/docs/crypto-nvidia_gpus.md
@@ -4,6 +4,13 @@ This documentation references useful information for implementing and optimizing
 
 ## Integer instruction bug
 
+### Integer FMA with carry-in uint64
+
+We get incorrect results for modular multiplication with 64-bit limbs due to a fused-multiply-add with carry bug.
+
+- https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
+- https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
+
 ### Integer FMA with carry-in uint32
 
 The instruction integer fused-multiply-add  with carry-in may
@@ -57,12 +64,6 @@ int main() {
 }
 ```
 
-### Integer FMA with carry-in uint64
-
-https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
-
-Unfortunately it seems like we're also hit by this on uint64.
-
 
 ## The hidden XMAD instruction
 

diff --git a/tests/gpu/t_ccopy.nim b/tests/gpu/t_ccopy.nim
@@ -63,4 +63,4 @@ let a = Fp[BN254_Snarks].fromHex("0x12345678FF11FFAA00321321CAFECAFE")
 let b = Fp[BN254_Snarks].fromHex("0xDEADBEEFDEADBEEFDEADBEEFDEADBEEF")
 
 
-testName(Fp[BN254_Snarks], 64, a, b)
+testName(Fp[BN254_Snarks], 32, a, b)
diff --git a/tests/gpu/t_cneg.nim b/tests/gpu/t_cneg.nim
@@ -46,4 +46,4 @@ proc testName[Name: static Algebra](field: type FF[Name], wordSize: int, a: FF[N
 
 let a = Fp[BN254_Snarks].fromHex("0x12345678FF11FFAA00321321CAFECAFE")
 
-testName(Fp[BN254_Snarks], 64, a)
+testName(Fp[BN254_Snarks], 32, a)
diff --git a/tests/gpu/t_ec_jac_coords.nim b/tests/gpu/t_ec_jac_coords.nim
@@ -14,7 +14,7 @@ import
   constantine/math/elliptic/ec_shortweierstrass_jacobian,
   constantine/platforms/abstractions,
   constantine/platforms/llvm/llvm,
-  constantine/math_compiler/[ir, pub_fields, pub_curves_jacobian, codegen_nvidia, impl_fields_globals],
+  constantine/math_compiler/[ir, pub_fields, pub_curves_jacobian, codegen_nvidia, impl_fields_globals, impl_curves_ops_jacobian],
   # Test utilities
   helpers/prng_unsafe
 
@@ -28,7 +28,7 @@ template genGetComponent*(asy: Assembler_LLVM, cd: CurveDescriptor, fn: typed):
     let rA = asy.asField(r, cd.fd.fieldTy)
 
     let x = fn(ec)
-    asy.store(rA, x)
+    rA.store(x)
 
     asy.br.retVoid()
   name
@@ -71,6 +71,6 @@ let y = "0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a"
 let pt = EC_ShortW_Jac[Fp[BN254_Snarks], G1].fromHex(x, y)
 echo pt.toHex()
 
-testX(Fp[BN254_Snarks], 64, pt)
-testY(Fp[BN254_Snarks], 64, pt)
-testZ(Fp[BN254_Snarks], 64, pt)
+testX(Fp[BN254_Snarks], 32, pt)
+testY(Fp[BN254_Snarks], 32, pt)
+testZ(Fp[BN254_Snarks], 32, pt)
diff --git a/tests/gpu/t_ec_sum.nim b/tests/gpu/t_ec_sum.nim
@@ -81,12 +81,12 @@ let pt2 = EC_ShortW_Jac[Fp[BN254_Snarks], G1].fromHex(x2, y2)
 ## If `skipFinalSub` is set to `true` in the EC sum implementation
 ##   `S1.prod(Q.z, Z2Z2, skipFinalSub = true)`
 ## the following fails at iteration 57.
-testSum(Fp[BN254_Snarks], 64, pt, pt2)
+testSum(Fp[BN254_Snarks], 32, pt, pt2)
 
 var pt2Aff: EC_ShortW_Aff[Fp[BN254_Snarks], G1]
 pt2Aff.affine(pt2)
 
-testMixedSum(Fp[BN254_Snarks], 64, pt, pt2Aff)
+testMixedSum(Fp[BN254_Snarks], 32, pt, pt2Aff)
 
 
 ## NOTE: While these inputs a, b are the ones that end up causing the
@@ -115,7 +115,7 @@ block SkipFinalSubIssue:
   let y2 = "0x0f53265870f65aa18bded3ccb9c62a4d8b060a32a05a75d455710bce95a991df"
   let b = EC_ShortW_Jac[Fp[BN254_Snarks], G1].fromHex(x2, y2)
 
-  let nv = initNvAsm(EC_ShortW_Jac[Fp[BN254_Snarks], G1], 64)
+  let nv = initNvAsm(EC_ShortW_Jac[Fp[BN254_Snarks], G1], 32)
   let kernel = nv.compile(genEcMixedSum)
 
   template checkSum(a, b): untyped =