From be019309b3f7a250c5c599bcda98148f92a77dcb Mon Sep 17 00:00:00 2001 From: Hanno Becker Date: Tue, 10 Dec 2024 06:37:13 +0000 Subject: [PATCH] Remove redundant reduction from `unpack_pk()` and `unpack_sk()` The specifications for the polynomial base multiplication currently require one input to be bound by q in absolute value. To ensure this, the unpacking routines `unpack_pk()` and `unpack_sk()` currently explicitly reduce their output via `poly_reduce()`. Functionally, this reduction is not needed: - For the PK, the modulus-check already ensures that the unpacked PK will be unsigned canonical. - For the SK, there is no requirement in the standard to check that the SK is reduced, or reduce it after unpacking. - There is no risk of overflow in the base multiplication even if the inputs are bound by 4096 rather than q: - For the C base multiplication, the input bound before modular reduction is now 2 * 4096 * 2^15, where before it was 2 * q * 2^15 -- both is below 2^31 so we are fine. After reduction, the 3/2 * q bound is weakened to 2 * q (a much better bound would be possible, but is unneeded), which is still sufficient for up to K=4-fold accumulation since 8 * q < INT16_MAX. - For the AArch64 base multiplication, we are accumulating in 32-bit up to a single final reduction. As in the C case, each 32-bit polynomial multiplication is bound by 2 * 4096 * 2^15 = 2^28, so the accumulated value before reduction is < 2^30 < INT32_MAX. - For the x86_64 base multiplication, the reasoning is unchanged: We don't accumulate in 32-bit but conduct a Montgomery multiplication for every monomial in the product, giving an output bound by q per monomial. Up to 8 such are accumulated, giving a bound by 8 * q < INT16_MAX. This commit strengthens the specifications for all polynomial base multiplication routines - basemul_cached - poly_basemul_montgomery_cached - polyvec_basemul_acc_montgomery - polyvec_basemul_acc_montgomery_cached - matvec_mul The polynomial reductions in `check_pk` and `check_sk` can then be safely removed. Signed-off-by: Hanno Becker --- mlkem/indcpa.c | 21 ++++++++------------- mlkem/native/aarch64/polyvec_clean.S | 11 ++++------- mlkem/native/x86_64/basemul.S | 7 ++----- mlkem/ntt.c | 7 +++---- mlkem/ntt.h | 8 ++++---- mlkem/params.h | 1 + mlkem/poly.c | 6 +++--- mlkem/poly.h | 6 +++--- mlkem/polyvec.c | 8 ++++---- mlkem/polyvec.h | 13 ++++++------- mlkem/reduce.c | 2 +- mlkem/reduce.h | 10 +++++----- 12 files changed, 44 insertions(+), 56 deletions(-) diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c index 69551e7b9..c3513911b 100644 --- a/mlkem/indcpa.c +++ b/mlkem/indcpa.c @@ -60,14 +60,10 @@ static void unpack_pk(polyvec *pk, uint8_t seed[MLKEM_SYMBYTES], polyvec_frombytes(pk, packedpk); memcpy(seed, packedpk + MLKEM_POLYVECBYTES, MLKEM_SYMBYTES); - /* - * TODO! We know from the modulus check that this will result in an - * unsigned canonical polynomial, but CBMC does not know it. We should - * weaken the specification of `unpack_pk()` and all depending functions - * to work with the weaker 4096-bound, so that the proofs go through - * without the need of this redundant call to polyvec_reduce(). - */ - polyvec_reduce(pk); + /* NOTE: If a modulus check was conducted on the PK, we know at this + * point that the coefficients of `pk` are unsigned canonical. The + * specifications and proofs, however, do _not_ assume this, and instead + * work with the easily provable bound by 4096. */ } /************************************************* @@ -91,15 +87,14 @@ static void pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], polyvec *sk) * Description: De-serialize the secret key; inverse of pack_sk * * Arguments: - polyvec *sk: pointer to output vector of polynomials (secret - *key) + * key) * - const uint8_t *packedsk: pointer to input serialized secret - *key + * key **************************************************/ static void unpack_sk(polyvec *sk, const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES]) { polyvec_frombytes(sk, packedsk); - polyvec_reduce(sk); } /************************************************* @@ -358,7 +353,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed) * * Arguments: - polyvec *out: Pointer to output polynomial vector * - polyvec a[MLKEM_K]: Input matrix. Must be in NTT domain - * and have coefficients of absolute value < MLKEM_Q. + * and have coefficients of absolute value < 4096. * - polyvec *v: Input polynomial vector. Must be in NTT domain. * - polyvec *vc: Mulcache for v, computed via * polyvec_mulcache_compute(). @@ -373,7 +368,7 @@ __contract__( requires(memory_no_alias(vc, sizeof(polyvec_mulcache))) requires(forall(int, k0, 0, MLKEM_K - 1, forall(int, k1, 0, MLKEM_K - 1, - array_abs_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))))) + array_abs_bound(a[k0].vec[k1].coeffs, 0, MLKEM_N - 1, UINT12_MAX)))) assigns(object_whole(out))) { int i; diff --git a/mlkem/native/aarch64/polyvec_clean.S b/mlkem/native/aarch64/polyvec_clean.S index 9504d3d1c..59f57d77a 100644 --- a/mlkem/native/aarch64/polyvec_clean.S +++ b/mlkem/native/aarch64/polyvec_clean.S @@ -32,8 +32,8 @@ // Computes products (a0*b0 + a0*b0t, a0*b1 + a1*b0) in 32-bit. // // Bounds: -// - Assume |a| < q, |b|, |bt| < NTT_BOUND -// - Result: < 2*q*NTT_BOUND +// - Assume |a| < 4096, +// - Result: < 2*4096*2^15 = 2^28 .macro pmull d, a, b smull \d\()0l.4s, \a\()0.4h, \b\()0.4h smull2 \d\()0h.4s, \a\()0.8h, \b\()0.8h @@ -252,11 +252,8 @@ _MLKEM_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_clean): // Bounds: // - // Each pmull is bound by 2*q*2^15, so the final value - // before Montgomery reduction is bound by q*2^18 ~ 2^29.7 - // - // The Montgomery reduction is thus bound by - // q * (2^18/2^16 + 1/2) = 4.5 q < 14981 + // Each pmull is bound by 2*4096*2^15=2^28, so the final value + // before Montgomery reduction is bound by 2^30. mov count, #(MLKEM_N / 16) k4_loop_start: diff --git a/mlkem/native/x86_64/basemul.S b/mlkem/native/x86_64/basemul.S index 0c52a2365..797b0f46a 100644 --- a/mlkem/native/x86_64/basemul.S +++ b/mlkem/native/x86_64/basemul.S @@ -104,11 +104,8 @@ vpaddw %ymm7,%ymm11,%ymm11 vpsubw %ymm13,%ymm10,%ymm13 vpsubw %ymm12,%ymm6,%ymm6 -/* Bounds: Note that we assume that a+b*X is coefficient-wise bound by q in - * in absolute value: This is coming from the contract for - * polyvec_basemul_acc_montgomery_cached(). - * - * Then, each Montgomery multiplication has absolute value < q, +/* Bounds: Since we are multiplying with signed canonical twiddles, + * each Montgomery multiplication has absolute value < q, * and hence the coefficients of the output have absolute value < 2q. */ vmovdqa %ymm13,(64*\off+ 0)*2(%rdi) vmovdqa %ymm9,(64*\off+16)*2(%rdi) diff --git a/mlkem/ntt.c b/mlkem/ntt.c index e1a9aacf4..5b17ad0ce 100644 --- a/mlkem/ntt.c +++ b/mlkem/ntt.c @@ -59,7 +59,7 @@ __contract__( for (j = start; j < start + len; j++) __loop__( invariant(start <= j && j <= start + len) - /* + /* * Coefficients are updated in strided pairs, so the bounds for the * intermediate states alternate twice between the old and new bound */ @@ -247,7 +247,7 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], { int32_t t0, t1; - BOUND(a, 2, MLKEM_Q, "basemul input bound"); + BOUND(a, 2, 4096, "basemul input bound"); t0 = (int32_t)a[1] * b_cached; t0 += (int32_t)a[0] * b[0]; @@ -258,6 +258,5 @@ void basemul_cached(int16_t r[2], const int16_t a[2], const int16_t b[2], r[0] = montgomery_reduce(t0); r[1] = montgomery_reduce(t1); - /* |r[i]| < 3/2 q */ - BOUND(r, 2, 3 * MLKEM_Q / 2, "basemul output bound"); + BOUND(r, 2, 2 * MLKEM_Q, "basemul output bound"); } diff --git a/mlkem/ntt.h b/mlkem/ntt.h index 558c1d99f..68c9e6724 100644 --- a/mlkem/ntt.h +++ b/mlkem/ntt.h @@ -78,9 +78,9 @@ __contract__( * * Arguments: - r: Pointer to output polynomial * Upon return, coefficients are bound by - * 3*(q+1)/2 in absolute value. + * 2*MLKEM_Q in absolute value. * - a: Pointer to first input polynomial - * Must be coefficient-wise < q in absolute value. + * Must be coefficient-wise < 4096 in absolute value. * - b: Pointer to second input polynomial * Can have arbitrary int16_t coefficients * - b_cached: Some precomputed value, typically derived from @@ -92,9 +92,9 @@ __contract__( requires(memory_no_alias(r, 2 * sizeof(int16_t))) requires(memory_no_alias(a, 2 * sizeof(int16_t))) requires(memory_no_alias(b, 2 * sizeof(int16_t))) - requires(array_abs_bound(a, 0, 1, MLKEM_Q - 1)) + requires(array_abs_bound(a, 0, 1, UINT12_MAX)) assigns(memory_slice(r, 2 * sizeof(int16_t))) - ensures(array_abs_bound(r, 0, 1, (3 * HALF_Q - 1))) + ensures(array_abs_bound(r, 0, 1, 2 * MLKEM_Q - 1)) ); diff --git a/mlkem/params.h b/mlkem/params.h index ab7beb7e1..cd76a7d67 100644 --- a/mlkem/params.h +++ b/mlkem/params.h @@ -28,6 +28,7 @@ #define MLKEM_N 256 #define MLKEM_Q 3329 +#define UINT12_MAX 4095 #define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define MLKEM_SSBYTES 32 /* size in bytes of shared key */ diff --git a/mlkem/poly.c b/mlkem/poly.c index 93a663c12..b4e50420c 100644 --- a/mlkem/poly.c +++ b/mlkem/poly.c @@ -311,7 +311,7 @@ void poly_frombytes(poly *r, const uint8_t a[MLKEM_POLYBYTES]) for (i = 0; i < MLKEM_N / 2; i++) __loop__( invariant(i >= 0 && i <= MLKEM_N / 2) - invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, 4095))) + invariant(array_bound(r->coeffs, 0, (2 * i - 1), 0, UINT12_MAX))) { /* REF-CHANGE: Introduce some locals for better readability */ const uint8_t t0 = a[3 * i + 0]; @@ -456,13 +456,13 @@ void poly_basemul_montgomery_cached(poly *r, const poly *a, const poly *b, const poly_mulcache *b_cache) { int i; - POLY_BOUND(b_cache, MLKEM_Q); + POLY_BOUND(b_cache, 4096); for (i = 0; i < MLKEM_N / 4; i++) __loop__( assigns(i, object_whole(r)) invariant(i >= 0 && i <= MLKEM_N / 4) - invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), (3 * HALF_Q - 1)))) + invariant(array_abs_bound(r->coeffs, 0, (4 * i - 1), 2 * MLKEM_Q - 1))) { basemul_cached(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], b_cache->coeffs[2 * i]); diff --git a/mlkem/poly.h b/mlkem/poly.h index 35990684b..772afefe0 100644 --- a/mlkem/poly.h +++ b/mlkem/poly.h @@ -439,7 +439,7 @@ __contract__( requires(memory_no_alias(a, MLKEM_POLYBYTES)) requires(memory_no_alias(r, sizeof(poly))) assigns(memory_slice(r, sizeof(poly))) - ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, 4095)) + ensures(array_bound(r->coeffs, 0, (MLKEM_N - 1), 0, UINT12_MAX)) ); @@ -637,9 +637,9 @@ __contract__( requires(memory_no_alias(a, sizeof(poly))) requires(memory_no_alias(b, sizeof(poly))) requires(memory_no_alias(b_cache, sizeof(poly_mulcache))) - requires(array_abs_bound(a->coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1))) + requires(array_abs_bound(a->coeffs, 0, MLKEM_N - 1, UINT12_MAX)) assigns(object_whole(r)) - ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, (3 * HALF_Q - 1))) + ensures(array_abs_bound(r->coeffs, 0, MLKEM_N - 1, 2 * MLKEM_Q - 1)) ); #define poly_tomont MLKEM_NAMESPACE(poly_tomont) diff --git a/mlkem/polyvec.c b/mlkem/polyvec.c index b658f8544..7f268ee50 100644 --- a/mlkem/polyvec.c +++ b/mlkem/polyvec.c @@ -79,7 +79,7 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, int i; poly t; - POLYVEC_BOUND(a, MLKEM_Q); + POLYVEC_BOUND(a, 4096); POLYVEC_BOUND(b, NTT_BOUND); POLYVEC_BOUND(b_cache, MLKEM_Q); @@ -98,17 +98,17 @@ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, * them from the spec to not unnecessarily constraint native implementations. */ cassert( - array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_K * (3 * HALF_Q - 1)), + array_abs_bound(r->coeffs, 0, MLKEM_N - 1, MLKEM_K * (2 * MLKEM_Q - 1)), "polyvec_basemul_acc_montgomery_cached output bounds"); /* TODO: Integrate CBMC assertion into POLY_BOUND if CBMC is set */ - POLY_BOUND(r, MLKEM_K * 3 * HALF_Q); + POLY_BOUND(r, MLKEM_K * 2 * MLKEM_Q); } #else /* !MLKEM_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */ void polyvec_basemul_acc_montgomery_cached(poly *r, const polyvec *a, const polyvec *b, const polyvec_mulcache *b_cache) { - POLYVEC_BOUND(a, MLKEM_Q); + POLYVEC_BOUND(a, 4096); POLYVEC_BOUND(b, NTT_BOUND); /* Omitting POLYVEC_BOUND(b_cache, MLKEM_Q) since native implementations may * decide not to use a mulcache. Note that the C backend implementation diff --git a/mlkem/polyvec.h b/mlkem/polyvec.h index 41753ff75..8c3f1462d 100644 --- a/mlkem/polyvec.h +++ b/mlkem/polyvec.h @@ -93,7 +93,7 @@ __contract__( * * Arguments: - const polyvec *a: pointer to output vector of polynomials * (of length MLKEM_POLYVECBYTES). Output will have coefficients - * normalized to [0,..,q-1]. + * normalized in [0..4095]. * - uint8_t *r: pointer to input byte array **************************************************/ void polyvec_frombytes(polyvec *r, const uint8_t a[MLKEM_POLYVECBYTES]) @@ -102,7 +102,7 @@ __contract__( requires(memory_no_alias(a, MLKEM_POLYVECBYTES)) assigns(object_whole(r)) ensures(forall(int, k0, 0, MLKEM_K - 1, - array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, 4095))) + array_bound(r->vec[k0].coeffs, 0, (MLKEM_N - 1), 0, UINT12_MAX))) ); #define polyvec_ntt MLKEM_NAMESPACE(polyvec_ntt) @@ -171,9 +171,9 @@ __contract__( requires(memory_no_alias(r, sizeof(poly))) requires(memory_no_alias(a, sizeof(polyvec))) requires(memory_no_alias(b, sizeof(polyvec))) -/* Input is coefficient-wise < q in absolute value */ + /* Input is coefficient-wise < q in absolute value */ requires(forall(int, k1, 0, MLKEM_K - 1, - array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))) + array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, UINT12_MAX))) assigns(memory_slice(r, sizeof(poly))) ); @@ -188,7 +188,7 @@ __contract__( * using mulcache for second operand. * * Bounds: - * - a is assumed to be coefficient-wise < q in absolute value. + * - a is assumed to be coefficient-wise < 4096 in absolute value. * - No bounds guarantees for the coefficients in the result. * * Arguments: - poly *r: pointer to output polynomial @@ -206,9 +206,8 @@ __contract__( requires(memory_no_alias(a, sizeof(polyvec))) requires(memory_no_alias(b, sizeof(polyvec))) requires(memory_no_alias(b_cache, sizeof(polyvec_mulcache))) -/* Input is coefficient-wise < q in absolute value */ requires(forall(int, k1, 0, MLKEM_K - 1, - array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, (MLKEM_Q - 1)))) + array_abs_bound(a->vec[k1].coeffs, 0, MLKEM_N - 1, UINT12_MAX))) assigns(memory_slice(r, sizeof(poly))) ); diff --git a/mlkem/reduce.c b/mlkem/reduce.c index 771dd7ee2..073dcb97e 100644 --- a/mlkem/reduce.c +++ b/mlkem/reduce.c @@ -97,7 +97,7 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a) int16_t montgomery_reduce(int32_t a) { int16_t res; - SCALAR_BOUND(a, 2 * MLKEM_Q * 32768, "montgomery_reduce input"); + SCALAR_BOUND(a, 2 * UINT12_MAX * 32768, "montgomery_reduce input"); res = montgomery_reduce_generic(a); diff --git a/mlkem/reduce.h b/mlkem/reduce.h index 24ade7343..0bb10a8a1 100644 --- a/mlkem/reduce.h +++ b/mlkem/reduce.h @@ -20,16 +20,16 @@ * Description: Montgomery reduction * * Arguments: - int32_t a: input integer to be reduced - * Must be smaller than 2 * q * 2^15 in absolute value. + * Must be smaller than 2 * 2^12 * 2^15 in absolute value. * * Returns: integer congruent to a * R^-1 modulo q, - * smaller than 3/2 q in absolute value. + * smaller than 2 * q in absolute value. **************************************************/ int16_t montgomery_reduce(int32_t a) __contract__( - requires(a > -(2 * MLKEM_Q * 32768)) - requires(a < (2 * MLKEM_Q * 32768)) - ensures(return_value > -(3 * HALF_Q) && return_value < (3 * HALF_Q)) + requires(a > -(2 * 4096 * 32768)) + requires(a < (2 * 4096 * 32768)) + ensures(return_value > -2 * MLKEM_Q && return_value < 2 * MLKEM_Q) ); #define barrett_reduce MLKEM_NAMESPACE(barrett_reduce)