Skip to content

Commit

Permalink
address review 3 comments
Browse files Browse the repository at this point in the history
  • Loading branch information
pittma committed Jul 25, 2024
1 parent 087bf5c commit c439bf0
Show file tree
Hide file tree
Showing 5 changed files with 367 additions and 357 deletions.
8 changes: 4 additions & 4 deletions crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl
Original file line number Diff line number Diff line change
Expand Up @@ -381,10 +381,10 @@ sub amm52x20_x1_norm {

###############################################################################
# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
# const BN_ULONG m[2][20],
# const BN_ULONG k0[2]);
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
# const BN_ULONG m[2][20],
# const BN_ULONG k0[2]);
###############################################################################

$code.=<<___;
Expand Down
2 changes: 1 addition & 1 deletion crypto/fipsmodule/bn/exponentiation.c
Original file line number Diff line number Diff line change
Expand Up @@ -1324,7 +1324,7 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1
}

int mod_bits = BN_num_bits(m1);
ret = rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
ret = RSAZ_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
in_mont1->RR.d, in_mont1->n0[0],
rr2->d, a2->d, p2->d, m2->d,
in_mont2->RR.d, in_mont2->n0[0],
Expand Down
170 changes: 0 additions & 170 deletions crypto/fipsmodule/bn/internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,176 +805,6 @@ void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len);


// Naming convention for the following functions:
//
// * amm: Almost Montgomery Multiplication
// * ams: Almost Montgomery Squaring
// * 52xZZ: data represented as array of ZZ digits in 52-bit radix
// * _x1_/_x2_: 1 or 2 independent inputs/outputs
// * ifma256: uses 256-bit wide IFMA ISA (AVX512_IFMA256)
//
//
// Almost Montgomery Multiplication (AMM) for a 20-digit number in radix
// 2^52.
//
// AMM is defined as presented in paper [1].
//
// The inputs and output are represented in the 2^52 radix domain, i.e.
// |res|, |a|, |b| and |m| are arrays of 20 64-bit qwords with the 12 high
// bits zeroed. |k0| is the Montgomery coefficient, which here is
// k0 = -1/m mod 2^64.
//
// NB: the AMM implementation does not perform the "conditional"
// subtraction step specified in the original algorithm because,
// according to Lemma 1 of paper [2], the result is always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition holds provided the parameter |s| (in the notation
// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
// matches our case: 1040 > 1024 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
BN_ULONG k0);

// Dual Almost Montgomery Multiplication for 20-digit numbers in radix
// 2^52.
//
// See the description of rsaz_amm52x20_x1_ifma256() above for
// details about the Almost Montgomery Multiplication algorithm and
// the meaning of the input parameters. Here |k0| is passed as a
// two-element array.
//
// This function performs two AMMs on two independent inputs, hence "dual".
void rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
const BN_ULONG k0[2]);

// Constant-time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1.
//
// The input |red_table| contains precomputations for two independent
// base values. |red_table_idx1| and |red_table_idx2| are the
// corresponding power indexes.
//
// The extracted value (output) is two 20-digit numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x20_win5(BN_ULONG *red_Y,
const BN_ULONG *red_table,
int red_table_idx1, int red_table_idx2);

// Almost Montgomery Multiplication (AMM) for a 30-digit number in radix
// 2^52.
//
// AMM is defined as presented in paper [1].
//
// The inputs and output are represented in the 2^52 radix domain, i.e.
// |res|, |a|, |b| and |m| are arrays of 32 64-bit qwords with the 12 high
// bits zeroed.
//
// NOTE: the function uses zero-padded data - the 2 high QWs are padding.
//
// |k0| is the Montgomery coefficient, which here is k0 = -1/m mod 2^64.
//
// NB: the AMM implementation does not perform the "conditional"
// subtraction step specified in the original algorithm because,
// according to Lemma 1 of paper [2], the result is always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition holds provided the parameter |s| (in the notation
// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
// matches our case: 1560 > 1536 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
BN_ULONG k0);
// Dual Almost Montgomery Multiplication for 30-digit numbers in radix
// 2^52.
//
// See the description of rsaz_amm52x30_x1_ifma256() above for
// details about the Almost Montgomery Multiplication algorithm and
// the meaning of the input parameters. Here |k0| is passed as a
// two-element array.
//
// This function performs two AMMs on two independent inputs, hence "dual".
//
// NOTE: the function uses zero-padded data - the 2 high QWs are padding.
void rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
const BN_ULONG k0[2]);

// Constant-time extraction from the precomputed table of powers
// base^i, where i = 0..2^EXP_WIN_SIZE-1.
//
// The input |red_table| contains precomputations for two independent
// base values. |red_table_idx1| and |red_table_idx2| are the
// corresponding power indexes.
//
// The extracted value (output) is two (30 + 2)-digit numbers in 2^52
// radix (the 2 high QWs are zero padding).
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x30_win5(BN_ULONG *red_Y,
const BN_ULONG *red_table,
int red_table_idx1, int red_table_idx2);

// Almost Montgomery Multiplication (AMM) for a 40-digit number in radix
// 2^52.
//
// AMM is defined as presented in paper [1].
//
// The inputs and output are represented in the 2^52 radix domain, i.e.
// |res|, |a|, |b| and |m| are arrays of 40 64-bit qwords with the 12 high
// bits zeroed. |k0| is the Montgomery coefficient, which here is
// k0 = -1/m mod 2^64.
//
// NB: the AMM implementation does not perform the "conditional"
// subtraction step specified in the original algorithm because,
// according to Lemma 1 of paper [2], the result is always < 2*m
// and can be used as a direct input to the next AMM iteration. This
// post-condition holds provided the parameter |s| (in the notation
// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
// matches our case: 2080 > 2048 + 2 * 1.
//
// [1] Gueron, S. Efficient software implementations of modular
// exponentiation. DOI: 10.1007/s13389-012-0031-5
// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI:
// 10.1007/3-540-36400-5_5
void rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
BN_ULONG k0);

// Dual Almost Montgomery Multiplication for 40-digit numbers in radix
// 2^52.
//
// See the description of rsaz_amm52x40_x1_ifma256() above for
// details about the Almost Montgomery Multiplication algorithm and
// the meaning of the input parameters. Here |k0| is passed as a
// two-element array.
//
// This function performs two AMMs on two independent inputs, hence "dual".
void rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
const BN_ULONG *b, const BN_ULONG *m,
const BN_ULONG k0[2]);

// Constant-time extraction from the precomputed table of powers base^i, where
// i = 0..2^EXP_WIN_SIZE-1.
//
// The input |red_table| contains precomputations for two independent base
// values. |red_table_idx1| and |red_table_idx2| are the corresponding power
// indexes.
//
// The extracted value (output) is two 40-digit numbers in 2^52 radix.
//
// EXP_WIN_SIZE = 5
void extract_multiplier_2x40_win5(BN_ULONG *red_Y,
const BN_ULONG *red_table,
int red_table_idx1, int red_table_idx2);


#if defined(__cplusplus)
} // extern C
#endif
Expand Down
Loading

0 comments on commit c439bf0

Please sign in to comment.