diff --git a/crypto_kem/ml-kem-512/m4fspeed/poly.c b/crypto_kem/ml-kem-512/m4fspeed/poly.c
index 401b26b7..5a6d0abb 100644
--- a/crypto_kem/ml-kem-512/m4fspeed/poly.c
+++ b/crypto_kem/ml-kem-512/m4fspeed/poly.c
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
     int i;
     uint16_t t0, t1;
 
+    poly_reduce(a);
     poly_reduce(a);
 
     for (i = 0; i < KYBER_N / 2; i++) {
diff --git a/crypto_kem/ml-kem-512/m4fstack/poly.c b/crypto_kem/ml-kem-512/m4fstack/poly.c
index 443fdbae..ed54ec87 100644
--- a/crypto_kem/ml-kem-512/m4fstack/poly.c
+++ b/crypto_kem/ml-kem-512/m4fstack/poly.c
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
     int i;
     uint16_t t0, t1;
 
+    poly_reduce(a);
     poly_reduce(a);
 
     for (i = 0; i < KYBER_N / 2; i++) {
diff --git a/crypto_kem/ml-kem-768/m4fspeed/poly.c b/crypto_kem/ml-kem-768/m4fspeed/poly.c
index b52060f9..b909a85c 100644
--- a/crypto_kem/ml-kem-768/m4fspeed/poly.c
+++ b/crypto_kem/ml-kem-768/m4fspeed/poly.c
@@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) {
         d0 >>= 31;
         t[k] = d0 & 0x7ff;
       }
-      
+
 
     r[352*i+11*j+ 0] =  t[0] & 0xff;
     r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
     int i;
     uint16_t t0, t1;
 
+    poly_reduce(a);
     poly_reduce(a);
 
     for (i = 0; i < KYBER_N / 2; i++) {
@@ -465,7 +466,7 @@ void poly_noise(poly *r, const unsigned char *seed, unsigned char nonce, int add
 *              Using strategy of better accumulation (initial step).
 * Arguments:   - const poly *a:       pointer to input polynomial
 *              - const poly *b:       pointer to input polynomial
-*              - const poly *a_prime: pointer to a pre-multiplied by zetas 
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
 *              - int32_t *r_tmp:      array for accumulating unreduced results
 **************************************************/
 extern void basemul_asm_opt_16_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
@@ -481,7 +482,7 @@ void poly_basemul_opt_16_32(int32_t *r_tmp, const poly *a, const poly *b, const
 *              Using strategy of better accumulation.
 * Arguments:   - const poly *a:       pointer to input polynomial
 *              - const poly *b:       pointer to input polynomial
-*              - const poly *a_prime: pointer to a pre-multiplied by zetas 
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
 *              - int32_t *r_tmp:      array for accumulating unreduced results
 **************************************************/
 extern void basemul_asm_acc_opt_32_32(int32_t *, const int16_t *, const int16_t *, const int16_t *);
@@ -497,7 +498,7 @@ void poly_basemul_acc_opt_32_32(int32_t *r_tmp, const poly *a, const poly *b, co
 *              Using strategy of better accumulation (final step).
 * Arguments:   - const poly *a:       pointer to input polynomial
 *              - const poly *b:       pointer to input polynomial
-*              - const poly *a_prime: pointer to a pre-multiplied by zetas 
+*              - const poly *a_prime: pointer to a pre-multiplied by zetas
 *              - poly *r:             pointer to output polynomial
 *              - int32_t *r_tmp:      array for accumulating unreduced results
 **************************************************/
diff --git a/crypto_kem/ml-kem-768/m4fstack/poly.c b/crypto_kem/ml-kem-768/m4fstack/poly.c
index 35475adb..fb13d155 100644
--- a/crypto_kem/ml-kem-768/m4fstack/poly.c
+++ b/crypto_kem/ml-kem-768/m4fstack/poly.c
@@ -142,7 +142,7 @@ void poly_packcompress(unsigned char *r, poly *a, int i) {
         d0 >>= 31;
         t[k] = d0 & 0x7ff;
       }
-      
+
 
     r[352*i+11*j+ 0] =  t[0] & 0xff;
     r[352*i+11*j+ 1] = (t[0] >>  8) | ((t[1] & 0x1f) << 3);
@@ -366,6 +366,7 @@ void poly_tobytes(unsigned char *r, poly *a) {
     int i;
     uint16_t t0, t1;
 
+    poly_reduce(a);
     poly_reduce(a);
 
     for (i = 0; i < KYBER_N / 2; i++) {