Skip to content

Commit

Permalink
Merge pull request #511 from pq-code-package/reduce_doc
Browse files Browse the repository at this point in the history
reduce.c: Simplify bounds documentation and fix runtime assertion
  • Loading branch information
hanno-becker authored Dec 12, 2024
2 parents a3e415c + 506e0ea commit 5bfad46
Showing 1 changed file with 21 additions and 26 deletions.
47 changes: 21 additions & 26 deletions mlkem/reduce.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,36 +45,13 @@ static INLINE int16_t cast_uint16_to_int16(uint16_t x)
*
* Arguments: - int32_t a: input integer to be reduced
*
* Returns: integer congruent to a * R^-1 modulo q
* Returns: integer congruent to a * R^-1 modulo q, with absolute value
* <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
*
* Bounds: For any C such that |a| < q * C, the return value
* has absolute value < q (C/2^16 + 1/2).
*
* Notable special cases:
* - The Montgomery multiplication of a value of absolute value
* < q * C with a signed-canonical value ( < q/2 ) has
* absolute value < q * (0.0254 * C + 1/2).
* - The Montgomery multiplication of a value of absolute value
* < q * C with a value t of |t| < q has absolute value
* < q * (0.0508 * C + 1/2).
* - The Montgomery multiplication of a value of absolute value
* < C with a value of abs < q has absolute value
* < q (C/2^16 + 1/2).
**************************************************/
ALWAYS_INLINE
static INLINE int16_t montgomery_reduce_generic(int32_t a)
{
/*
* Bounds on paper (r denotes the return value, t the int16_t intermediate,
* so |t| <= 2^15)
* - Case |a| < q * C, for some C
* |r| <= |a|/2^16 + |t|*q/2^16
* < q * C / 2^16 + q/2
* = q (C/2^16 + 1/2)
* - Case |a| < (q/2) * C * q, for some C
* Replace C -> C * q in the above and estimate
* q / 2^17 < 0.0254.
*/

/* Compute a*q^{-1} mod 2^16 in unsigned representatives */
const uint16_t a_reduced = a & UINT16_MAX;
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
Expand All @@ -83,13 +60,20 @@ static INLINE int16_t montgomery_reduce_generic(int32_t a)
const int16_t t = cast_uint16_to_int16(a_inverted);

int32_t r = a - ((int32_t)t * MLKEM_Q);
/* Bounds: |r| <= |a| + 2^15 * MLKEM_Q */

/*
* PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
* implementation-defined for negative left argument. Here,
* we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
*/
r = r >> 16;
/* Bounds: |r >> 16| <= ceil(|r| / 2^16)
* <= ceil(|a| / 2^16 + MLKEM_Q / 2)
* <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
*
* (Note that |a >> 16| = ceil(|a| / 2^16) for negative a)
*/

return (int16_t)r;
}
Expand All @@ -100,8 +84,13 @@ int16_t montgomery_reduce(int32_t a)
SCALAR_BOUND(a, 2 * UINT12_MAX * 32768, "montgomery_reduce input");

res = montgomery_reduce_generic(a);
/* Bounds:
* |res| <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
* <= ceil(2 * UINT12_MAX * 32768 / 65536) + (MLKEM_Q + 1) / 2
* <= UINT12_MAX + (MLKEM_Q + 1) / 2
* < 2 * MLKEM_Q */

SCALAR_BOUND(res, (3 * (MLKEM_Q + 1)) / 2, "montgomery_reduce output");
SCALAR_BOUND(res, 2 * MLKEM_Q, "montgomery_reduce output");
return res;
}

Expand All @@ -111,6 +100,12 @@ int16_t fqmul(int16_t a, int16_t b)
SCALAR_BOUND(b, HALF_Q, "fqmul input");

res = montgomery_reduce((int32_t)a * (int32_t)b);
/* Bounds:
* |res| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
* <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
* <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
* < MLKEM_Q
*/

SCALAR_BOUND(res, MLKEM_Q, "fqmul output");
return res;
Expand Down

18 comments on commit 5bfad46

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A76 (Raspberry Pi 5) benchmarks

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 29185 cycles 29185 cycles 1
ML-KEM-512 encaps 35550 cycles 35551 cycles 1.00
ML-KEM-512 decaps 46100 cycles 46102 cycles 1.00
ML-KEM-768 keypair 49233 cycles 49235 cycles 1.00
ML-KEM-768 encaps 55500 cycles 55497 cycles 1.00
ML-KEM-768 decaps 70227 cycles 70228 cycles 1.00
ML-KEM-1024 keypair 72246 cycles 72247 cycles 1.00
ML-KEM-1024 encaps 81106 cycles 81116 cycles 1.00
ML-KEM-1024 decaps 100950 cycles 100953 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 13915 cycles 13510 cycles 1.03
ML-KEM-512 encaps 17245 cycles 17301 cycles 1.00
ML-KEM-512 decaps 23087 cycles 22838 cycles 1.01
ML-KEM-768 keypair 22494 cycles 22517 cycles 1.00
ML-KEM-768 encaps 24579 cycles 24520 cycles 1.00
ML-KEM-768 decaps 32576 cycles 32555 cycles 1.00
ML-KEM-1024 keypair 31387 cycles 31395 cycles 1.00
ML-KEM-1024 encaps 34871 cycles 34905 cycles 1.00
ML-KEM-1024 decaps 45701 cycles 45825 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 18097 cycles 18106 cycles 1.00
ML-KEM-512 encaps 23140 cycles 23149 cycles 1.00
ML-KEM-512 decaps 30443 cycles 30463 cycles 1.00
ML-KEM-768 keypair 31128 cycles 31092 cycles 1.00
ML-KEM-768 encaps 34204 cycles 34124 cycles 1.00
ML-KEM-768 decaps 44656 cycles 44663 cycles 1.00
ML-KEM-1024 keypair 44576 cycles 44598 cycles 1.00
ML-KEM-1024 encaps 49890 cycles 49915 cycles 1.00
ML-KEM-1024 decaps 64382 cycles 64406 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 4th gen (c7i) (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 35446 cycles 35549 cycles 1.00
ML-KEM-512 encaps 44054 cycles 43926 cycles 1.00
ML-KEM-512 decaps 58722 cycles 58454 cycles 1.00
ML-KEM-768 keypair 59028 cycles 58968 cycles 1.00
ML-KEM-768 encaps 70349 cycles 70270 cycles 1.00
ML-KEM-768 decaps 89084 cycles 89052 cycles 1.00
ML-KEM-1024 keypair 86482 cycles 86573 cycles 1.00
ML-KEM-1024 encaps 102183 cycles 102066 cycles 1.00
ML-KEM-1024 decaps 126319 cycles 125645 cycles 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 14882 cycles 14880 cycles 1.00
ML-KEM-512 encaps 19683 cycles 19705 cycles 1.00
ML-KEM-512 decaps 26353 cycles 26384 cycles 1.00
ML-KEM-768 keypair 25552 cycles 25559 cycles 1.00
ML-KEM-768 encaps 28103 cycles 28094 cycles 1.00
ML-KEM-768 decaps 37813 cycles 37808 cycles 1.00
ML-KEM-1024 keypair 35615 cycles 35636 cycles 1.00
ML-KEM-1024 encaps 41001 cycles 40946 cycles 1.00
ML-KEM-1024 decaps 54437 cycles 54436 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 18204 cycles 18205 cycles 1.00
ML-KEM-512 encaps 22233 cycles 22234 cycles 1.00
ML-KEM-512 decaps 28995 cycles 28993 cycles 1.00
ML-KEM-768 keypair 30701 cycles 30696 cycles 1.00
ML-KEM-768 encaps 33743 cycles 33742 cycles 1.00
ML-KEM-768 decaps 43339 cycles 43337 cycles 1.00
ML-KEM-1024 keypair 44358 cycles 44364 cycles 1.00
ML-KEM-1024 encaps 49793 cycles 49797 cycles 1.00
ML-KEM-1024 decaps 62876 cycles 62878 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 20329 cycles 20325 cycles 1.00
ML-KEM-512 encaps 26912 cycles 26919 cycles 1.00
ML-KEM-512 decaps 35742 cycles 35730 cycles 1.00
ML-KEM-768 keypair 34866 cycles 34866 cycles 1
ML-KEM-768 encaps 38220 cycles 38205 cycles 1.00
ML-KEM-768 decaps 50921 cycles 50936 cycles 1.00
ML-KEM-1024 keypair 47945 cycles 47962 cycles 1.00
ML-KEM-1024 encaps 53961 cycles 53929 cycles 1.00
ML-KEM-1024 decaps 71510 cycles 71531 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 18989 cycles 18990 cycles 1.00
ML-KEM-512 encaps 23581 cycles 23584 cycles 1.00
ML-KEM-512 decaps 30740 cycles 30742 cycles 1.00
ML-KEM-768 keypair 32251 cycles 32252 cycles 1.00
ML-KEM-768 encaps 35685 cycles 35686 cycles 1.00
ML-KEM-768 decaps 45867 cycles 45867 cycles 1
ML-KEM-1024 keypair 46806 cycles 46807 cycles 1.00
ML-KEM-1024 encaps 52555 cycles 52556 cycles 1.00
ML-KEM-1024 decaps 66451 cycles 66444 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 3rd gen (c6a) (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 52306 cycles 52341 cycles 1.00
ML-KEM-512 encaps 65909 cycles 65948 cycles 1.00
ML-KEM-512 decaps 88556 cycles 88633 cycles 1.00
ML-KEM-768 keypair 85600 cycles 85662 cycles 1.00
ML-KEM-768 encaps 101705 cycles 101779 cycles 1.00
ML-KEM-768 decaps 132595 cycles 132638 cycles 1.00
ML-KEM-1024 keypair 124943 cycles 125105 cycles 1.00
ML-KEM-1024 encaps 146105 cycles 146318 cycles 1.00
ML-KEM-1024 decaps 184591 cycles 184560 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 29202 cycles 29195 cycles 1.00
ML-KEM-512 encaps 35572 cycles 35562 cycles 1.00
ML-KEM-512 decaps 46126 cycles 46112 cycles 1.00
ML-KEM-768 keypair 49229 cycles 49249 cycles 1.00
ML-KEM-768 encaps 55496 cycles 55528 cycles 1.00
ML-KEM-768 decaps 70212 cycles 70237 cycles 1.00
ML-KEM-1024 keypair 72343 cycles 72332 cycles 1.00
ML-KEM-1024 encaps 81104 cycles 81084 cycles 1.00
ML-KEM-1024 decaps 100950 cycles 100928 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AMD EPYC 4th gen (c7a) (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 45737 cycles 45754 cycles 1.00
ML-KEM-512 encaps 56887 cycles 56918 cycles 1.00
ML-KEM-512 decaps 76259 cycles 76297 cycles 1.00
ML-KEM-768 keypair 74542 cycles 74524 cycles 1.00
ML-KEM-768 encaps 88462 cycles 88134 cycles 1.00
ML-KEM-768 decaps 113874 cycles 113829 cycles 1.00
ML-KEM-1024 keypair 109401 cycles 109467 cycles 1.00
ML-KEM-1024 encaps 126981 cycles 127078 cycles 1.00
ML-KEM-1024 decaps 159463 cycles 159481 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton4 (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 41978 cycles 41968 cycles 1.00
ML-KEM-512 encaps 50179 cycles 50179 cycles 1
ML-KEM-512 decaps 66065 cycles 66064 cycles 1.00
ML-KEM-768 keypair 69065 cycles 69062 cycles 1.00
ML-KEM-768 encaps 79787 cycles 79786 cycles 1.00
ML-KEM-768 decaps 101039 cycles 101039 cycles 1
ML-KEM-1024 keypair 102242 cycles 102497 cycles 1.00
ML-KEM-1024 encaps 117257 cycles 117492 cycles 1.00
ML-KEM-1024 decaps 143770 cycles 143470 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Intel Xeon 3rd gen (c6i) (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 56797 cycles 56828 cycles 1.00
ML-KEM-512 encaps 69370 cycles 69420 cycles 1.00
ML-KEM-512 decaps 91193 cycles 91195 cycles 1.00
ML-KEM-768 keypair 91995 cycles 92099 cycles 1.00
ML-KEM-768 encaps 107709 cycles 107870 cycles 1.00
ML-KEM-768 decaps 136285 cycles 136514 cycles 1.00
ML-KEM-1024 keypair 135037 cycles 135097 cycles 1.00
ML-KEM-1024 encaps 155441 cycles 155495 cycles 1.00
ML-KEM-1024 decaps 191923 cycles 191987 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton3 (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 45347 cycles 45349 cycles 1.00
ML-KEM-512 encaps 54188 cycles 54189 cycles 1.00
ML-KEM-512 decaps 71115 cycles 71113 cycles 1.00
ML-KEM-768 keypair 74820 cycles 74811 cycles 1.00
ML-KEM-768 encaps 86067 cycles 86070 cycles 1.00
ML-KEM-768 decaps 108761 cycles 108766 cycles 1.00
ML-KEM-1024 keypair 111101 cycles 111080 cycles 1.00
ML-KEM-1024 encaps 125977 cycles 125976 cycles 1.00
ML-KEM-1024 decaps 154633 cycles 154628 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Graviton2 (no-opt)

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 71170 cycles 71113 cycles 1.00
ML-KEM-512 encaps 85068 cycles 85069 cycles 1.00
ML-KEM-512 decaps 112744 cycles 112755 cycles 1.00
ML-KEM-768 keypair 117782 cycles 117788 cycles 1.00
ML-KEM-768 encaps 135418 cycles 135434 cycles 1.00
ML-KEM-768 decaps 171942 cycles 171961 cycles 1.00
ML-KEM-1024 keypair 175236 cycles 175359 cycles 1.00
ML-KEM-1024 encaps 197359 cycles 197409 cycles 1.00
ML-KEM-1024 decaps 243558 cycles 243621 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bananapi bpi-f3 benchmarks

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 368957 cycles 368942 cycles 1.00
ML-KEM-512 encaps 508177 cycles 508179 cycles 1.00
ML-KEM-512 decaps 682344 cycles 682312 cycles 1.00
ML-KEM-768 keypair 619562 cycles 619600 cycles 1.00
ML-KEM-768 encaps 799242 cycles 799279 cycles 1.00
ML-KEM-768 decaps 1025243 cycles 1025223 cycles 1.00
ML-KEM-1024 keypair 922566 cycles 922629 cycles 1.00
ML-KEM-1024 encaps 1146508 cycles 1147343 cycles 1.00
ML-KEM-1024 decaps 1420999 cycles 1420196 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A55 (Snapdragon 888) benchmarks

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 57942 cycles 57917 cycles 1.00
ML-KEM-512 encaps 65237 cycles 65215 cycles 1.00
ML-KEM-512 decaps 83893 cycles 83905 cycles 1.00
ML-KEM-768 keypair 98093 cycles 98057 cycles 1.00
ML-KEM-768 encaps 108820 cycles 109135 cycles 1.00
ML-KEM-768 decaps 135970 cycles 136044 cycles 1.00
ML-KEM-1024 keypair 148663 cycles 148734 cycles 1.00
ML-KEM-1024 encaps 164468 cycles 165251 cycles 1.00
ML-KEM-1024 decaps 200064 cycles 200515 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

@oqs-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arm Cortex-A72 (Raspberry Pi 4) benchmarks

Benchmark suite Current: 5bfad46 Previous: a3e415c Ratio
ML-KEM-512 keypair 51454 cycles 51635 cycles 1.00
ML-KEM-512 encaps 57979 cycles 58258 cycles 1.00
ML-KEM-512 decaps 73529 cycles 73876 cycles 1.00
ML-KEM-768 keypair 87843 cycles 88910 cycles 0.99
ML-KEM-768 encaps 96159 cycles 97385 cycles 0.99
ML-KEM-768 decaps 119337 cycles 119980 cycles 0.99
ML-KEM-1024 keypair 131980 cycles 132451 cycles 1.00
ML-KEM-1024 encaps 144414 cycles 145660 cycles 0.99
ML-KEM-1024 decaps 175666 cycles 175776 cycles 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.