diff --git a/constantine.nimble b/constantine.nimble
index 31086b2c..e1b03079 100644
--- a/constantine.nimble
+++ b/constantine.nimble
@@ -496,7 +496,7 @@ const testDesc: seq[tuple[path: string, useGMP: bool]] = @[
   ("tests/t_ethereum_evm_precompiles.nim", false),
   ("tests/t_ethereum_bls_signatures.nim", false),
   ("tests/t_ethereum_eip2333_bls12381_key_derivation.nim", false),
-  ("tests/t_ethereum_eip4844_kzg.nim", false),
+  ("tests/t_ethereum_eip4844_deneb_kzg.nim", false),
 ]
 
 const testDescNvidia: seq[string] = @[
diff --git a/constantine/commitments/kzg_polynomial_commitments.nim b/constantine/commitments/kzg_polynomial_commitments.nim
index 7bc4b326..16d09da1 100644
--- a/constantine/commitments/kzg_polynomial_commitments.nim
+++ b/constantine/commitments/kzg_polynomial_commitments.nim
@@ -190,7 +190,8 @@ func kzg_prove*[N: static int, C: static Curve](
        poly: PolynomialEval[N, Fr[C]],
        domain: PolyDomainEval[N, Fr[C]],
        challenge: Fr[C],
-       powers_of_tau: PolynomialEval[N, G1aff[C]]) =
+       powers_of_tau: PolynomialEval[N, G1aff[C]],
+       isBitReversedDomain: static bool) =
 
   # Note:
   #   The order of inputs in
@@ -227,7 +228,7 @@ func kzg_prove*[N: static int, C: static Curve](
 
     # q(x) = (p(x) - p(z)) / (x - z)
     diffQuotientPolyFr[].differenceQuotientEvalInDomain(
-      poly, zIndex, invRootsMinusZ[], domain)
+      poly, uint32 zIndex, invRootsMinusZ[], domain, isBitReversedDomain)
 
   freeHeapAligned(invRootsMinusZ)
 
@@ -242,7 +243,7 @@ func kzg_prove*[N: static int, C: static Curve](
   var proofJac {.noInit.}: ECP_ShortW_Jac[Fp[C], G1]
   proofJac.multiScalarMul_vartime(diffQuotientPolyBigInt[], powers_of_tau.evals)
   proof.affine(proofJac)
-  
+
   freeHeapAligned(diffQuotientPolyBigInt)
 
 
diff --git a/constantine/ethereum_eip4844_kzg_polynomial_commitments.nim b/constantine/ethereum_eip4844_kzg_polynomial_commitments.nim
index 4dc04619..4530aadc 100644
--- a/constantine/ethereum_eip4844_kzg_polynomial_commitments.nim
+++ b/constantine/ethereum_eip4844_kzg_polynomial_commitments.nim
@@ -314,7 +314,8 @@ func compute_kzg_proof*(
   kzg_prove(
     proof, y,
     poly[], ctx.domain,
-    z, ctx.srs_lagrange_g1)
+    z, ctx.srs_lagrange_g1,
+    bitreversedDomain = true)
 
   discard proof_bytes.serialize_g1_compressed(proof) # cannot fail
   y_bytes.marshal(y, bigEndian) # cannot fail
diff --git a/constantine/math/polynomials/polynomials.nim b/constantine/math/polynomials/polynomials.nim
index 29292d34..6e97881a 100644
--- a/constantine/math/polynomials/polynomials.nim
+++ b/constantine/math/polynomials/polynomials.nim
@@ -40,6 +40,11 @@ type
 
   PolyDomainEval*[N: static int, Field] = object
     ## Metadata for polynomial in Lagrange basis (evaluation form)
+    ##
+    ## Note on inverses
+    ##   1/ωⁱ (mod N) = ωⁿ⁻ⁱ (mod N)
+    ## Hence in canonical representation rootsOfUnity[(N-i) and (N-1)] contains the inverse of rootsOfUnity[i]
+    ## This translates into rootsOfUnity[brp((N-brp(i)) and (N-1))] when bit-reversal permuted
     rootsOfUnity*{.align: 64.}: array[N, Field]
     invMaxDegree*: Field
 
@@ -141,9 +146,10 @@ func differenceQuotientEvalOffDomain*[N: static int, Field](
 func differenceQuotientEvalInDomain*[N: static int, Field](
        r: var PolynomialEval[N, Field],
        poly: PolynomialEval[N, Field],
-       zIndex: int,
+       zIndex: uint32,
        invRootsMinusZ: array[N, Field],
-       domain: PolyDomainEval[N, Field]) =
+       domain: PolyDomainEval[N, Field],
+       isBitReversedDomain: static bool) =
   ## Compute r(x) = (p(x) - p(z)) / (x - z)
   ##
   ## for z = ωⁱ a power of a root of unity
@@ -153,9 +159,14 @@ func differenceQuotientEvalInDomain*[N: static int, Field](
   ##   - rootsOfUnity:    ωⁱ
   ##   - invRootsMinusZ:  1/(ωⁱ-z)
   ##   - zIndex:          the index of the root of unity power that matches z = ωⁱᵈˣ
+
+  static:
+    # For powers of 2: x mod N == x and (N-1)
+    doAssert N.isPowerOf2_vartime()
+
   r.evals[zIndex].setZero()
 
-  for i in 0 ..< N:
+  for i in 0'u32 ..< N:
     if i == zIndex:
       # https://dankradfeist.de/ethereum/2021/06/18/pcs-multiproofs.html
       # section "Dividing when one of the points is zero".
@@ -168,31 +179,32 @@ func differenceQuotientEvalInDomain*[N: static int, Field](
 
     # q'ᵢ = -qᵢ * ωⁱ/z
     # q'idx = ∑ q'ᵢ
-    # since z is a power of ω, ωⁱ/z = ωⁱ⁻ⁱᵈˣ
-    # However some protocols use bit-reversal permutation (brp) to store the ωⁱ
-    # Hence retrieving the data would require roots[brp((brp(i)-brp(index)) mod n)] for those
-    # But is this fast? There is no single instruction for reversing bits of an integer.
-    # and the reversal depends on N.
-    # - https://stackoverflow.com/questions/746171/efficient-algorithm-for-bit-reversal-from-msb-lsb-to-lsb-msb-in-c
-    # - https://stackoverflow.com/questions/52226858/bit-reversal-algorithm-by-rutkowska
-    # - https://www.hpl.hp.com/techreports/93/HPL-93-89.pdf
-    # - https://graphics.stanford.edu/~seander/bithacks.html#BitReverseObvious
-    # The C version from Stanford's bithacks need log₂(n) loop iterations
-    # A 254~255-bit multiplication takes 38 cycles, we need 3 brp so at most ~13 cycles per brp
-    # For small Ethereum KZG, n = 2¹² = 4096, we're already at the breaking point
-    # even if an iteration takes a single cycle with instruction-level parallelism
     var ri {.noinit.}: Field
-    ri.neg(r.evals[i])                                 # -qᵢ
-    ri *= domain.rootsOfUnity[(i+N-zIndex) and (N-1)]  # -qᵢ * ωⁱ/z (explanation at the bottom)
-    r.evals[zIndex] += ri                              # r[zIndex] = ∑ -qᵢ * ωⁱ/z
-
-    # ωⁱ/z computation detail
-    #  from ωⁿ = 1 and z = ωⁱᵈˣ
-    #  hence ωⁿ⁻ⁱᵈˣ = 1/z
-    #  Note if using bit-reversal permutation (BRP):
-    #    BRP maintains the relationship
-    #    that the inverse of ωⁱ is at position n-i (mod n) in the array of roots of unity
+    ri.neg(r.evals[i])                                  # -qᵢ
+    when isBitReversedDomain:
+      const logN = log2_vartime(uint32 N)
+      let invZidx = N - reverseBits(uint32 zIndex, logN)
+      let canonI = reverseBits(i, logN)
+      let idx = reverseBits((canonI + invZidx) and (N-1), logN)
+      ri *= domain.rootsOfUnity[idx]                    # -qᵢ * ωⁱ/z  (explanation at the bottom)
+    else:
+      ri *= domain.rootsOfUnity[(i+N-zIndex) and (N-1)] # -qᵢ * ωⁱ/z  (explanation at the bottom)
+    r.evals[zIndex] += ri                               # r[zIndex] = ∑ -qᵢ * ωⁱ/z
+
+    # * 1/z computation detail
+    #    from ωⁿ = 1 and z = ωⁱᵈˣ
+    #    hence ωⁿ⁻ⁱᵈˣ = 1/z
+    #    However our z may be in bit-reversal permuted
+    #
+    # * We want ωⁱ/z which translate to ωⁱ*ωⁿ⁻ⁱᵈˣ hence ωⁱ⁺ⁿ⁻ⁱᵈˣ
+    #   with the roots of unity being a cyclic group of order N so we compute i+N-zIndex (mod N)
+    #
+    #   However some protocols use bit-reversal permutation (brp) to store the ωⁱ
+    #   Hence retrieving the data requires roots[brp((brp(i)-n-brp(idx)) mod n)] for those (note: n = brp(n))
     #
-    # We want ωⁱ/z which translate to ωⁱ*ωⁿ⁻ⁱᵈˣ hence ωⁱ⁺ⁿ⁻ⁱᵈˣ
-    # with the roots of unity being a cyclic group of order N so we compute i+N-zIndex (mod N)
-    static: doAssert N.isPowerOf2_vartime()
+    #   For Ethereum:
+    #     A 254~255-bit multiplication takes 11ns / 38 cycles (Fr[BLS12-381]),
+    #     A brp with n = 2¹² = 4096 (for EIP4844) takes about 6ns
+    #   We could also cache either ωⁿ⁻ⁱ or a map i' = brp(n - brp(i))
+    #   in non-brp order but cache misses are expensive
+    #   and brp can benefits from instruction-level parallelism
diff --git a/tests/t_ethereum_eip4844_deneb_kzg.nim b/tests/t_ethereum_eip4844_deneb_kzg.nim
index a9b8d48b..2a6ff6a9 100644
--- a/tests/t_ethereum_eip4844_deneb_kzg.nim
+++ b/tests/t_ethereum_eip4844_deneb_kzg.nim
@@ -149,8 +149,8 @@ block:
     test "blob_to_kzg_commitment(dst: var array[48, byte], blob: ptr array[4096, byte])":
       ctx.test_blob_to_kzg_commitment()
 
-    # test "compute_kzg_proof(proof: var array[48, byte], y: var array[32, byte], blob: ptr array[4096, byte], z: array[32, byte])":
-    #   ctx.test_compute_kzg_proof()
+    test "compute_kzg_proof(proof: var array[48, byte], y: var array[32, byte], blob: ptr array[4096, byte], z: array[32, byte])":
+      ctx.test_compute_kzg_proof()
 
     test "verify_kzg_proof(commitment: array[48, byte], z, y: array[32, byte], proof: array[48, byte]) -> bool":
       ctx.test_verify_kzg_proof()
diff --git a/tests/t_ethereum_eip4844_deneb_kzg.nim.cfg b/tests/t_ethereum_eip4844_deneb_kzg.nim.cfg
new file mode 100644
index 00000000..b8c53144
--- /dev/null
+++ b/tests/t_ethereum_eip4844_deneb_kzg.nim.cfg
@@ -0,0 +1,2 @@
+# NimYAML requires ORC instead of ARC for memory management to deal with cycles
+--mm:orc