From 6bc8ae3bc2d3020152e2a3c8fcea919892547dfb Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 8 Nov 2024 04:09:07 +0000
Subject: [PATCH 1/5] KeyGen: Avoid passing an array of poly pointers

The A-matrix generation routine gen_matrix() first generates 4
matrix entries a time, and then does one more, if necessary.
(For MLKEM_K=2 and MLKEM_K=4, the number of matrix entries is
a multiple of 4, while for MLKEM_K=3, we need 9=2*4+1 matrix
entries).

The function taking care of generating 4 entries a time is
gen_matrix_entry_x4(). Previously, this function was rather
generic in that it took an array of 4 pointers to polynomials
as input, allowing them to be scattered in memory.

This generality is both unneeded and problematic for verification:
- It's unneeded, because we always generate a _slice_ of the matrix
  at a time -- that is, the pointers passed are always consecutive.
- It's problematic for verification in CBMC, since I am unaware of
  a way to specify a precondition saying that the 4 given polynomials
  may be part of the same parent object, but must be disjoint in it.

This commit modifies the signature of gen_matrix_entry_x4() to just
pass the address of the sizeof(poly)*4-sized slice of the matrix to
be generated as an argument. This slightly simplifies the caller, since
we don't need to prepare an array of pointers. It also removes the
issue for CBMC verification, since now gen_matrix_entry_x4() only
needs to specify a single validity condition for the entire slice.

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
---
 mlkem/indcpa.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
index 6b19a8cd3..e0a822875 100644
--- a/mlkem/indcpa.c
+++ b/mlkem/indcpa.c
@@ -147,8 +147,7 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 
 // Generate four A matrix entries from a seed, using rejection
 // sampling on the output of a XOF.
-static void gen_matrix_entry_x4(poly *vec[4],
-                                uint8_t seed[4][MLKEM_SYMBYTES + 16]) {
+static void gen_matrix_entry_x4(poly vec[4], uint8_t seed[4][MLKEM_SYMBYTES + 16]) {
   // Temporary buffers for XOF output before rejection sampling
   uint8_t bufx[KECCAK_WAY][GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
   // Tracks the number of coefficients we have already sampled
@@ -166,7 +165,7 @@ static void gen_matrix_entry_x4(poly *vec[4],
                            GEN_MATRIX_NBLOCKS, &statex);
   buflen = GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
   for (unsigned int j = 0; j < KECCAK_WAY; j++) {
-    ctr[j] = rej_uniform(vec[j]->coeffs, MLKEM_N, 0, bufx[j], buflen);
+    ctr[j] = rej_uniform(vec[j].coeffs, MLKEM_N, 0, bufx[j], buflen);
   }
 
   // So long as not all matrix entries have been generated, squeeze
@@ -176,7 +175,7 @@ static void gen_matrix_entry_x4(poly *vec[4],
          ctr[3] < MLKEM_N) {
     shake128x4_squeezeblocks(bufx[0], bufx[1], bufx[2], bufx[3], 1, &statex);
     for (unsigned j = 0; j < KECCAK_WAY; j++) {
-      ctr[j] = rej_uniform(vec[j]->coeffs, MLKEM_N, ctr[j], bufx[j], buflen);
+      ctr[j] = rej_uniform(vec[j].coeffs, MLKEM_N, ctr[j], bufx[j], buflen);
     }
   }
   shake128x4_ctx_release(&statex);
@@ -250,7 +249,6 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
   for (i = 0; i < (MLKEM_K * MLKEM_K / KECCAK_WAY) * KECCAK_WAY;
        i += KECCAK_WAY) {
     uint8_t x, y;
-    poly *vec[4];
 
     for (unsigned int j = 0; j < KECCAK_WAY; j++) {
       x = (i + j) / MLKEM_K;
@@ -262,10 +260,9 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
         seedxy[j][MLKEM_SYMBYTES + 0] = y;
         seedxy[j][MLKEM_SYMBYTES + 1] = x;
       }
-      vec[j] = &a[x].vec[y];
     }
 
-    gen_matrix_entry_x4(vec, seedxy);
+    gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
   }
 
   // For left over vector, we use single keccak.
@@ -282,7 +279,7 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
       seedxy[0][MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[x].vec[y], seedxy[0]);
+    gen_matrix_entry(&a[0].vec[0] + i, seedxy[0]);
   }
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)

From 43327024a679c64dec7619c01b7e1d656e755939 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 8 Nov 2024 04:18:30 +0000
Subject: [PATCH 2/5] KeyGen: Use separate stack-allocated seed buffers

Currently, the A-generation routine gen_matrix() uses
a single 2D seed array to manage the 4 seed buffers needed
for the batched generation of 4 matrix entries at a time.

Further down the call stack, slices of that same seed buffer
are passed to the shake256x4-API for absorbing. As in the
previous commit, this causes issues with verification, since
CBMC does not support -- or, at least I am unaware of --
specifying disjointness of slices within the same object.

This commit avoids the issue by allocating separate seed
buffers on the stack, rather than a single 2D array.
This also allows to avoid the manual padding of the seed
buffer, and use standard alignment constraints instead.

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
---
 mlkem/indcpa.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
index e0a822875..923c2426b 100644
--- a/mlkem/indcpa.c
+++ b/mlkem/indcpa.c
@@ -147,7 +147,7 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 
 // Generate four A matrix entries from a seed, using rejection
 // sampling on the output of a XOF.
-static void gen_matrix_entry_x4(poly vec[4], uint8_t seed[4][MLKEM_SYMBYTES + 16]) {
+static void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4]) {
   // Temporary buffers for XOF output before rejection sampling
   uint8_t bufx[KECCAK_WAY][GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
   // Tracks the number of coefficients we have already sampled
@@ -185,9 +185,9 @@ static void gen_matrix_entry_x4(poly vec[4], uint8_t seed[4][MLKEM_SYMBYTES + 16
 // sampling on the output of a XOF.
 STATIC_TESTABLE
 void gen_matrix_entry(poly *entry,
-                      uint8_t seed[MLKEM_SYMBYTES + 16])  // clang-format off
+                      uint8_t seed[MLKEM_SYMBYTES + 2])  // clang-format off
   REQUIRES(IS_FRESH(entry, sizeof(poly)))
-  REQUIRES(IS_FRESH(seed, MLKEM_SYMBYTES + 16))
+  REQUIRES(IS_FRESH(seed, MLKEM_SYMBYTES + 2))
   ASSIGNS(OBJECT_UPTO(entry, sizeof(poly)))
   ENSURES(ARRAY_IN_BOUNDS(int, k, 0, MLKEM_N - 1, entry->coeffs, 0, (MLKEM_Q - 1)))
 {  // clang-format on
@@ -195,7 +195,6 @@ void gen_matrix_entry(poly *entry,
   uint8_t buf[GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
   unsigned int ctr, buflen;
 
-  // seed is MLKEM_SYMBYTES + 2 bytes long, but padded to MLKEM_SYMBYTES + 16
   shake128_absorb(&state, seed, MLKEM_SYMBYTES + 2);
 
   // Initially, squeeze + sample heuristic number of GEN_MATRIX_NBLOCKS.
@@ -235,8 +234,15 @@ void gen_matrix_entry(poly *entry,
 void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
                 int transposed) {
   int i;
-  // We need MLKEM_SYMBYTES + 2 bytes per seed, but add padding for alignment
-  uint8_t seedxy[KECCAK_WAY][MLKEM_SYMBYTES + 16];
+  // We generate four separate seed arrays rather than a single one to work
+  // around limitations in CBMC function contracts dealing with disjoint slices
+  // of the same parent object.
+  uint8_t seed0[MLKEM_SYMBYTES + 2] ALIGN;
+  uint8_t seed1[MLKEM_SYMBYTES + 2] ALIGN;
+  uint8_t seed2[MLKEM_SYMBYTES + 2] ALIGN;
+  uint8_t seed3[MLKEM_SYMBYTES + 2] ALIGN;
+  uint8_t *seedxy[] = {seed0, seed1, seed2, seed3};
+
   for (unsigned j = 0; j < KECCAK_WAY; j++) {
     memcpy(seedxy[j], seed, MLKEM_SYMBYTES);
   }
@@ -272,14 +278,14 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
     y = i % MLKEM_K;
 
     if (transposed) {
-      seedxy[0][MLKEM_SYMBYTES + 0] = x;
-      seedxy[0][MLKEM_SYMBYTES + 1] = y;
+      seed0[MLKEM_SYMBYTES + 0] = x;
+      seed0[MLKEM_SYMBYTES + 1] = y;
     } else {
-      seedxy[0][MLKEM_SYMBYTES + 0] = y;
-      seedxy[0][MLKEM_SYMBYTES + 1] = x;
+      seed0[MLKEM_SYMBYTES + 0] = y;
+      seed0[MLKEM_SYMBYTES + 1] = x;
     }
 
-    gen_matrix_entry(&a[0].vec[0] + i, seedxy[0]);
+    gen_matrix_entry(&a[0].vec[0] + i, seed0);
   }
 
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)

From cbb05654996209255a10f7c439a835fc72326053 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 8 Nov 2024 05:56:18 +0000
Subject: [PATCH 3/5] CBMC: Add spec + proof for gen_matrix_entry_x4

This commit adds a CBMC sped + proof for gen_matrix_entry_x4(),
the part of the A-matrix expanion which generates a slice of four
matrix entries at a time.

While the core functions rej_uniform and shake128x4_xxx are
axiomatized or specified, the proof still proved to be rather
fiddy to get through.
- One has to be careful to spell out the 0,1,2,3 cases, rather than
  using static loops. I don't know why, but somehow the proof gets
  much slower otherwise (in fact, I don't know if it'll terminate
  in acceptable time).
- As in the previous commit, one has to split the temporary sampling
  buffer from a 2D array into four separate stack buffers, to avoid
  (seeming) limitations around pointers to the same object in CBMC.

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
---
 cbmc/proofs/gen_matrix_entry_x4/Makefile      | 54 ++++++++++++++++++
 .../proofs/gen_matrix_entry_x4/cbmc-proof.txt |  3 +
 .../gen_matrix_entry_x4_harness.c             | 32 +++++++++++
 fips202/fips202x4.h                           | 25 ++++++++-
 mlkem/indcpa.c                                | 55 +++++++++++++++----
 5 files changed, 155 insertions(+), 14 deletions(-)
 create mode 100644 cbmc/proofs/gen_matrix_entry_x4/Makefile
 create mode 100644 cbmc/proofs/gen_matrix_entry_x4/cbmc-proof.txt
 create mode 100644 cbmc/proofs/gen_matrix_entry_x4/gen_matrix_entry_x4_harness.c

diff --git a/cbmc/proofs/gen_matrix_entry_x4/Makefile b/cbmc/proofs/gen_matrix_entry_x4/Makefile
new file mode 100644
index 000000000..436187c21
--- /dev/null
+++ b/cbmc/proofs/gen_matrix_entry_x4/Makefile
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = gen_matrix_entry_x4_harness
+
+# This should be a unique identifier for this proof, and will appear on the
+# Litani dashboard. It can be human-readable and contain spaces if you wish.
+PROOF_UID = gen_matrix_entry_x4
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET +=
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/indcpa.c $(SRCDIR)/fips202/fips202x4.c
+
+CHECK_FUNCTION_CONTRACTS= gen_matrix_entry_x4
+USE_FUNCTION_CONTRACTS=$(FIPS202_NAMESPACE)shake128x4_absorb $(FIPS202_NAMESPACE)shake128x4_squeezeblocks $(MLKEM_NAMESPACE)rej_uniform
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--smt2
+
+FUNCTION_NAME = $(MLKEM_NAMESPACE)gen_matrix_entry_x4
+
+# If this proof is found to consume huge amounts of RAM, you can set the
+# EXPENSIVE variable. With new enough versions of the proof tools, this will
+# restrict the number of EXPENSIVE CBMC jobs running at once. See the
+# documentation in Makefile.common under the "Job Pools" heading for details.
+# EXPENSIVE = true
+
+# This function is large enough to need...
+CBMC_OBJECT_BITS = 10
+
+# If you require access to a file-local ("static") function or object to conduct
+# your proof, set the following (and do not include the original source file
+# ("mlkem/poly.c") in PROJECT_SOURCES).
+# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i
+# include ../Makefile.common
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mlkem/poly.c
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz
+# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must
+# be set before including Makefile.common, but any use of variables on the
+# left-hand side requires those variables to be defined. Hence, _SOURCE,
+# _FUNCTIONS, _OBJECTS is set after including Makefile.common.
+
+include ../Makefile.common
diff --git a/cbmc/proofs/gen_matrix_entry_x4/cbmc-proof.txt b/cbmc/proofs/gen_matrix_entry_x4/cbmc-proof.txt
new file mode 100644
index 000000000..3d1913ebf
--- /dev/null
+++ b/cbmc/proofs/gen_matrix_entry_x4/cbmc-proof.txt
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# This file marks this directory as containing a CBMC proof.
diff --git a/cbmc/proofs/gen_matrix_entry_x4/gen_matrix_entry_x4_harness.c b/cbmc/proofs/gen_matrix_entry_x4/gen_matrix_entry_x4_harness.c
new file mode 100644
index 000000000..b959ebf3d
--- /dev/null
+++ b/cbmc/proofs/gen_matrix_entry_x4/gen_matrix_entry_x4_harness.c
@@ -0,0 +1,32 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+/*
+ * Insert copyright notice
+ */
+
+/**
+ * @file gen_matrix_entry_x4_harness.c
+ * @brief Implements the proof harness for gen_matrix_entry_x4 function.
+ */
+
+/*
+ * Insert project header files that
+ *   - include the declaration of the function
+ *   - include the types needed to declare function arguments
+ */
+#include <stdint.h>
+#include "poly.h"
+
+// declare here since it's static in non-CBMC builds
+void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4]);
+
+/**
+ * @brief Starting point for formal analysis
+ *
+ */
+void harness(void) {
+  poly out[4];
+  uint8_t *seed[4];
+  gen_matrix_entry_x4(out, seed);
+}
diff --git a/fips202/fips202x4.h b/fips202/fips202x4.h
index f0bd3a8a4..9abc05ff8 100644
--- a/fips202/fips202x4.h
+++ b/fips202/fips202x4.h
@@ -4,6 +4,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include "fips202.h"
 #include "keccakf1600.h"
 #include "namespace.h"
 
@@ -14,15 +15,35 @@ void shake128x4_absorb(keccakx4_state *state, const uint8_t *in0,
                        const uint8_t *in1, const uint8_t *in2,
                        const uint8_t *in3, size_t inlen);
 
+// clang-format off
 #define shake256x4_absorb FIPS202_NAMESPACE(shake256x4_absorb)
 void shake256x4_absorb(keccakx4_state *state, const uint8_t *in0,
                        const uint8_t *in1, const uint8_t *in2,
-                       const uint8_t *in3, size_t inlen);
+                       const uint8_t *in3, size_t inlen)
+REQUIRES(IS_FRESH(state, sizeof(keccakx4_state)))
+REQUIRES(IS_FRESH(in0, inlen))
+REQUIRES(IS_FRESH(in1, inlen))
+REQUIRES(IS_FRESH(in2, inlen))
+REQUIRES(IS_FRESH(in3, inlen))
+ASSIGNS(OBJECT_WHOLE(state));
+// clang-format on
 
+// clang-format off
 #define shake128x4_squeezeblocks FIPS202_NAMESPACE(shake128x4_squeezeblocks)
 void shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2,
                               uint8_t *out3, size_t nblocks,
-                              keccakx4_state *state);
+                              keccakx4_state *state)
+REQUIRES(IS_FRESH(state, sizeof(keccakx4_state)))
+REQUIRES(IS_FRESH(out0, nblocks * SHAKE128_RATE))
+REQUIRES(IS_FRESH(out1, nblocks * SHAKE128_RATE))
+REQUIRES(IS_FRESH(out2, nblocks * SHAKE128_RATE))
+REQUIRES(IS_FRESH(out3, nblocks * SHAKE128_RATE))
+ASSIGNS(OBJECT_UPTO(out0, nblocks * SHAKE128_RATE),
+  OBJECT_UPTO(out1, nblocks * SHAKE128_RATE),
+  OBJECT_UPTO(out2, nblocks * SHAKE128_RATE),
+  OBJECT_UPTO(out3, nblocks * SHAKE128_RATE),
+  OBJECT_WHOLE(state));
+// clang-format on
 
 #define shake256x4_squeezeblocks FIPS202_NAMESPACE(shake256x4_squeezeblocks)
 void shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2,
diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
index 923c2426b..ddff6726b 100644
--- a/mlkem/indcpa.c
+++ b/mlkem/indcpa.c
@@ -147,9 +147,28 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 
 // Generate four A matrix entries from a seed, using rejection
 // sampling on the output of a XOF.
-static void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4]) {
+// clang-format off
+STATIC_TESTABLE
+void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4])
+  REQUIRES(IS_FRESH(vec, sizeof(poly) * 4))
+  REQUIRES(IS_FRESH(seed, sizeof(uint8_t*) * 4))
+  REQUIRES(IS_FRESH(seed[0], MLKEM_SYMBYTES + 2))
+  REQUIRES(IS_FRESH(seed[1], MLKEM_SYMBYTES + 2))
+  REQUIRES(IS_FRESH(seed[2], MLKEM_SYMBYTES + 2))
+  REQUIRES(IS_FRESH(seed[3], MLKEM_SYMBYTES + 2))
+  ASSIGNS(OBJECT_UPTO(vec, sizeof(poly) * 4))
+  ENSURES(ARRAY_IN_BOUNDS(int, k0, 0, MLKEM_N - 1, vec[0].coeffs, 0, (MLKEM_Q - 1)))
+  ENSURES(ARRAY_IN_BOUNDS(int, k1, 0, MLKEM_N - 1, vec[1].coeffs, 0, (MLKEM_Q - 1)))
+  ENSURES(ARRAY_IN_BOUNDS(int, k2, 0, MLKEM_N - 1, vec[2].coeffs, 0, (MLKEM_Q - 1)))
+  ENSURES(ARRAY_IN_BOUNDS(int, k3, 0, MLKEM_N - 1, vec[3].coeffs, 0, (MLKEM_Q - 1)))
+// clang-format on
+{
   // Temporary buffers for XOF output before rejection sampling
-  uint8_t bufx[KECCAK_WAY][GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf0[GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf1[GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf2[GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+  uint8_t buf3[GEN_MATRIX_NBLOCKS * SHAKE128_RATE];
+
   // Tracks the number of coefficients we have already sampled
   unsigned int ctr[KECCAK_WAY];
   keccakx4_state statex;
@@ -161,23 +180,35 @@ static void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4]) {
 
   // Initially, squeeze heuristic number of GEN_MATRIX_NBLOCKS.
   // This should generate the matrix entries with high probability.
-  shake128x4_squeezeblocks(bufx[0], bufx[1], bufx[2], bufx[3],
-                           GEN_MATRIX_NBLOCKS, &statex);
+  shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, GEN_MATRIX_NBLOCKS, &statex);
   buflen = GEN_MATRIX_NBLOCKS * SHAKE128_RATE;
-  for (unsigned int j = 0; j < KECCAK_WAY; j++) {
-    ctr[j] = rej_uniform(vec[j].coeffs, MLKEM_N, 0, bufx[j], buflen);
-  }
+  ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, 0, buf0, buflen);
+  ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, 0, buf1, buflen);
+  ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, 0, buf2, buflen);
+  ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, 0, buf3, buflen);
 
   // So long as not all matrix entries have been generated, squeeze
   // one more block a time until we're done.
   buflen = SHAKE128_RATE;
   while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
-         ctr[3] < MLKEM_N) {
-    shake128x4_squeezeblocks(bufx[0], bufx[1], bufx[2], bufx[3], 1, &statex);
-    for (unsigned j = 0; j < KECCAK_WAY; j++) {
-      ctr[j] = rej_uniform(vec[j].coeffs, MLKEM_N, ctr[j], bufx[j], buflen);
+         ctr[3] < MLKEM_N)  // clang-format off
+    ASSIGNS(ctr, statex, OBJECT_UPTO(vec, sizeof(poly) * 4), OBJECT_WHOLE(buf0),
+       OBJECT_WHOLE(buf1), OBJECT_WHOLE(buf2), OBJECT_WHOLE(buf3))
+    INVARIANT(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
+    INVARIANT(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
+    INVARIANT(ctr[0] > 0 ==> ARRAY_IN_BOUNDS(int, k0, 0, ctr[0] - 1, vec[0].coeffs, 0, (MLKEM_Q - 1)))
+    INVARIANT(ctr[1] > 0 ==> ARRAY_IN_BOUNDS(int, k1, 0, ctr[1] - 1, vec[1].coeffs, 0, (MLKEM_Q - 1)))
+    INVARIANT(ctr[2] > 0 ==> ARRAY_IN_BOUNDS(int, k2, 0, ctr[2] - 1, vec[2].coeffs, 0, (MLKEM_Q - 1)))
+    INVARIANT(ctr[3] > 0 ==> ARRAY_IN_BOUNDS(int, k3, 0, ctr[3] - 1, vec[3].coeffs, 0, (MLKEM_Q - 1)))
+                            // clang-format on
+    {
+      shake128x4_squeezeblocks(buf0, buf1, buf2, buf3, 1, &statex);
+      ctr[0] = rej_uniform(vec[0].coeffs, MLKEM_N, ctr[0], buf0, buflen);
+      ctr[1] = rej_uniform(vec[1].coeffs, MLKEM_N, ctr[1], buf1, buflen);
+      ctr[2] = rej_uniform(vec[2].coeffs, MLKEM_N, ctr[2], buf2, buflen);
+      ctr[3] = rej_uniform(vec[3].coeffs, MLKEM_N, ctr[3], buf3, buflen);
     }
-  }
+
   shake128x4_ctx_release(&statex);
 }
 

From c5b619ddc9633258e2ca8d489ffe76abdbe8f982 Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 8 Nov 2024 06:13:29 +0000
Subject: [PATCH 4/5] KeyGen: Remove size annotation from gen_matrix_entry_x4()

gen_matrix_entry_x4() expects a slice of 4 polynomials to be filled.
In the caller gen_matrix(), this slice is constructed as a pointer
into an array of MLKEM_K polyvecs. Importantly, however, the slice of
4 poly's passed to gen_matrix_entry_x4() can cross the boundary
between two polyvec instances. Some compilers notice that from
the signature of gen_matrix_entry_x4(), and fail.

The behaviour of the code is intentional and safe here. To suppress
the compiler warning, this commit changes the signature of
gen_matrix_entry_x4() to use a generic poly* pointer, rather than
a pointer to a poly[4].

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
---
 mlkem/indcpa.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
index ddff6726b..db066d684 100644
--- a/mlkem/indcpa.c
+++ b/mlkem/indcpa.c
@@ -149,7 +149,7 @@ static void unpack_ciphertext(polyvec *b, poly *v,
 // sampling on the output of a XOF.
 // clang-format off
 STATIC_TESTABLE
-void gen_matrix_entry_x4(poly vec[4], uint8_t *seed[4])
+void gen_matrix_entry_x4(poly *vec, uint8_t *seed[4])
   REQUIRES(IS_FRESH(vec, sizeof(poly) * 4))
   REQUIRES(IS_FRESH(seed, sizeof(uint8_t*) * 4))
   REQUIRES(IS_FRESH(seed[0], MLKEM_SYMBYTES + 2))
@@ -299,6 +299,8 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
       }
     }
 
+    // This call writes across polyvec boundaries for K=2 and K=3.
+    // This is intentional and safe.
     gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
   }
 

From 6833bbf8fc8e45c6dde0a3d25dbe9344970fb77f Mon Sep 17 00:00:00 2001
From: Hanno Becker <beckphan@amazon.co.uk>
Date: Fri, 8 Nov 2024 06:41:44 +0000
Subject: [PATCH 5/5] CBMC: Add spec + proof for gen_matrix

All loops in gen_matrix() are unrolled by CBMC. The loop
handling residual polynomials after x4-batched entry
generation is replaced by conditional generation of a
single polynomial: For K=2 and K=4, batching gives the
full matrix, while for K=3, only one polynomial remains.

Signed-off-by: Hanno Becker <beckphan@amazon.co.uk>
---
 cbmc/proofs/gen_matrix/Makefile             | 54 +++++++++++++++++++++
 cbmc/proofs/gen_matrix/cbmc-proof.txt       |  3 ++
 cbmc/proofs/gen_matrix/gen_matrix_harness.c | 30 ++++++++++++
 mlkem/indcpa.c                              |  7 ++-
 mlkem/indcpa.h                              | 11 ++++-
 5 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 cbmc/proofs/gen_matrix/Makefile
 create mode 100644 cbmc/proofs/gen_matrix/cbmc-proof.txt
 create mode 100644 cbmc/proofs/gen_matrix/gen_matrix_harness.c

diff --git a/cbmc/proofs/gen_matrix/Makefile b/cbmc/proofs/gen_matrix/Makefile
new file mode 100644
index 000000000..eb3d76228
--- /dev/null
+++ b/cbmc/proofs/gen_matrix/Makefile
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+
+include ../Makefile_params.common
+
+HARNESS_ENTRY = harness
+HARNESS_FILE = gen_matrix_harness
+
+# This should be a unique identifier for this proof, and will appear on the
+# Litani dashboard. It can be human-readable and contain spaces if you wish.
+PROOF_UID = gen_matrix
+
+DEFINES +=
+INCLUDES +=
+
+REMOVE_FUNCTION_BODY +=
+UNWINDSET += $(MLKEM_NAMESPACE)gen_matrix.0:4 $(MLKEM_NAMESPACE)gen_matrix.1:4 $(MLKEM_NAMESPACE)gen_matrix.2:4
+
+PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
+PROJECT_SOURCES += $(SRCDIR)/mlkem/indcpa.c
+
+CHECK_FUNCTION_CONTRACTS=$(MLKEM_NAMESPACE)gen_matrix
+USE_FUNCTION_CONTRACTS=gen_matrix_entry gen_matrix_entry_x4
+APPLY_LOOP_CONTRACTS=on
+USE_DYNAMIC_FRAMES=1
+
+# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
+EXTERNAL_SAT_SOLVER=
+CBMCFLAGS=--smt2
+
+FUNCTION_NAME = $(MLKEM_NAMESPACE)gen_matrix
+
+# If this proof is found to consume huge amounts of RAM, you can set the
+# EXPENSIVE variable. With new enough versions of the proof tools, this will
+# restrict the number of EXPENSIVE CBMC jobs running at once. See the
+# documentation in Makefile.common under the "Job Pools" heading for details.
+# EXPENSIVE = true
+
+# This function is large enough to need...
+CBMC_OBJECT_BITS = 8
+
+# If you require access to a file-local ("static") function or object to conduct
+# your proof, set the following (and do not include the original source file
+# ("mlkem/poly.c") in PROJECT_SOURCES).
+# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i
+# include ../Makefile.common
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mlkem/poly.c
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar
+# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz
+# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must
+# be set before including Makefile.common, but any use of variables on the
+# left-hand side requires those variables to be defined. Hence, _SOURCE,
+# _FUNCTIONS, _OBJECTS is set after including Makefile.common.
+
+include ../Makefile.common
diff --git a/cbmc/proofs/gen_matrix/cbmc-proof.txt b/cbmc/proofs/gen_matrix/cbmc-proof.txt
new file mode 100644
index 000000000..3d1913ebf
--- /dev/null
+++ b/cbmc/proofs/gen_matrix/cbmc-proof.txt
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# This file marks this directory as containing a CBMC proof.
diff --git a/cbmc/proofs/gen_matrix/gen_matrix_harness.c b/cbmc/proofs/gen_matrix/gen_matrix_harness.c
new file mode 100644
index 000000000..64a78cd1d
--- /dev/null
+++ b/cbmc/proofs/gen_matrix/gen_matrix_harness.c
@@ -0,0 +1,30 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT-0
+
+/*
+ * Insert copyright notice
+ */
+
+/**
+ * @file gen_matrix_harness.c
+ * @brief Implements the proof harness for gen_matrix function.
+ */
+
+/*
+ * Insert project header files that
+ *   - include the declaration of the function
+ *   - include the types needed to declare function arguments
+ */
+#include <stdint.h>
+#include "indcpa.h"
+
+/**
+ * @brief Starting point for formal analysis
+ *
+ */
+void harness(void) {
+  polyvec *a;
+  uint8_t *seed;
+  int transposed;
+  gen_matrix(a, seed, transposed);
+}
diff --git a/mlkem/indcpa.c b/mlkem/indcpa.c
index db066d684..2522a9ac5 100644
--- a/mlkem/indcpa.c
+++ b/mlkem/indcpa.c
@@ -304,8 +304,8 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
     gen_matrix_entry_x4(&a[0].vec[0] + i, seedxy);
   }
 
-  // For left over vector, we use single keccak.
-  for (; i < MLKEM_K * MLKEM_K; i++) {
+  // For left over polynomial, we use single keccak.
+  if (i < MLKEM_K * MLKEM_K) {
     uint8_t x, y;
     x = i / MLKEM_K;
     y = i % MLKEM_K;
@@ -319,8 +319,11 @@ void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES],
     }
 
     gen_matrix_entry(&a[0].vec[0] + i, seed0);
+    i++;
   }
 
+  ASSERT(i == MLKEM_K * MLKEM_K, "gen_matrix: failed to generate whole matrix");
+
 #if defined(MLKEM_USE_NATIVE_NTT_CUSTOM_ORDER)
   // The public matrix is generated in NTT domain. If the native backend
   // uses a custom order in NTT domain, permute A accordingly.
diff --git a/mlkem/indcpa.h b/mlkem/indcpa.h
index 71d062698..c78be02ec 100644
--- a/mlkem/indcpa.h
+++ b/mlkem/indcpa.h
@@ -3,11 +3,20 @@
 #define INDCPA_H
 
 #include <stdint.h>
+#include "cbmc.h"
 #include "params.h"
 #include "polyvec.h"
 
+// clang-format off
 #define gen_matrix MLKEM_NAMESPACE(gen_matrix)
-void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed);
+void gen_matrix(polyvec *a, const uint8_t seed[MLKEM_SYMBYTES], int transposed)
+REQUIRES(IS_FRESH(a, sizeof(polyvec) * MLKEM_K))
+REQUIRES(IS_FRESH(seed, MLKEM_SYMBYTES))
+REQUIRES(transposed == 0 || transposed == 1)
+ASSIGNS(OBJECT_UPTO(a, sizeof(polyvec) * MLKEM_K))
+ENSURES(FORALL(int, x, 0, MLKEM_K - 1, FORALL(int, y, 0, MLKEM_K - 1,
+  ARRAY_IN_BOUNDS(int, i, 0, MLKEM_N - 1, a[x].vec[y].coeffs, 0, (MLKEM_Q - 1)))))
+;  // clang-format on
 
 #define indcpa_keypair_derand MLKEM_NAMESPACE(indcpa_keypair_derand)
 void indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],