From 80f984ef78203bbdb61a2ef3fee55d8956ad7e8c Mon Sep 17 00:00:00 2001
From: dkostic <25055813+dkostic@users.noreply.github.com>
Date: Fri, 22 Nov 2024 10:48:06 -0800
Subject: [PATCH] [EC] Unify scalar_mul_base point for ec_nistp curves (#2003)

Added unified scalar multiplication of the base point for curves
implemented in ec_nistp. This is a refactor of the algorithm in
p384.c and p521.c that makes it generic. The implementation
in p384.c and p521.c is substituted with this new unified
implementation.
---
 crypto/fipsmodule/ec/ec_nistp.c | 140 ++++++++++++++++++++++++++++-
 crypto/fipsmodule/ec/ec_nistp.h |   8 ++
 crypto/fipsmodule/ec/p384.c     | 155 ++------------------------------
 crypto/fipsmodule/ec/p521.c     | 153 ++-----------------------------
 4 files changed, 163 insertions(+), 293 deletions(-)

diff --git a/crypto/fipsmodule/ec/ec_nistp.c b/crypto/fipsmodule/ec/ec_nistp.c
index e370866f85..f3025ee3bd 100644
--- a/crypto/fipsmodule/ec/ec_nistp.c
+++ b/crypto/fipsmodule/ec/ec_nistp.c
@@ -17,7 +17,7 @@
 // |----------------------------|
 // | 1. |   x   |   x   |   x*  |
 // | 2. |   x   |   x   |   x*  |
-// | 3. |       |       |       |
+// | 3. |   x   |   x   |       |
 // | 4. |   x   |   x   |   x*  |
 // | 5. |       |       |       |
 //  * For P-256, only the Fiat-crypto implementation in p256.c is replaced. 
@@ -498,3 +498,141 @@ void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
   cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
   cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
 }
+
+// Multiplication of the base point G of the curve with the given scalar.
+// The product is computed with the Comb method using a precomputed table
+// and the regular-wNAF scalar encoding.
+//
+// While the algorithm is generic and works for different curves, window sizes,
+// and scalar sizes, for clarity, we describe it by using the example of P-521.
+//
+// The precomputed table has 27 sub-tables each holding 16 points:
+//
+//      0 :       [1]G,       [3]G,  ...,       [31]G
+//      1 :  [1*2^20]G,  [3*2^20]G,  ...,  [31*2^20]G
+//                         ...
+//      i : [1*2^20i]G, [3*2^20i]G,  ..., [31*2^20i]G
+//                         ...
+//     26 :   [2^520]G, [3*2^520]G,  ..., [31*2^520]G
+// Computing the negation of a point P = (x, y) is relatively easy:
+//     -P = (x, -y).
+// So we may assume that for each sub-table we have 32 points instead of 16:
+//     [\pm 1*2^20i]G, [\pm 3*2^20i]G, ..., [\pm 31*2^20i]G.
+//
+// The 521-bit |scalar| is recoded (regular-wNAF encoding) into 105 signed
+// digits, each of length 5 bits, by the |scalar_rwnaf| function that is
+// called at the start of this routine. Namely,
+//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_104*2^520,
+// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
+// scalar we have that scalar = scalar', while in the case of an even
+// scalar we have that scalar = scalar' - 1.
+//
+// To compute the required product, [scalar]G, we may do the following.
+// Group the recoded digits of the scalar in 4 groups:
+//                                            |   corresponding multiples in
+//                  digits                    |   the recoded representation
+//   -------------------------------------------------------------------------
+//   (0): {s_0, s_4,  s_8, ..., s_100, s_104} |  { 2^0, 2^20, ..., 2^500, 2^520}
+//   (1): {s_1, s_5,  s_9, ..., s_101}        |  { 2^5, 2^25, ..., 2^505}
+//   (2): {s_2, s_6, s_10, ..., s_102}        |  {2^10, 2^30, ..., 2^510}
+//   (3): {s_3, s_7, s_11, ..., s_103}        |  {2^15, 2^35, ..., 2^515}
+//        corresponding sub-table lookup      |  {  T0,   T1, ...,   T25,   T26}
+//
+// The group (0) digits correspond precisely to the multiples of G that are
+// held in the 27 precomputed sub-tables, so we may simply read the appropriate
+// points from the sub-tables and sum them all up (negating if needed, i.e., if
+// a digit s_i is negative, we read the point corresponding to the abs(s_i) and
+// negate it before adding it to the sum).
+// The remaining three groups (1), (2), and (3), correspond to the multiples
+// of G from the sub-tables multiplied additionally by 2^5, 2^10, and 2^15,
+// respectively. Therefore, for these groups we may read the appropriate points
+// from the table, double them 5, 10, or 15 times, respectively, and add them
+// to the final result.
+//
+// To minimize the number of required doubling operations we process the digits
+// of the scalar from left to right. In other words, the algorithm is:
+//   1. For group (i) in this order (3, 2, 1, 0):
+//   2.   Double the accumulator 5 times except in the first iteration.
+//   3.   Read the points corresponding to the group (i) digits from the tables
+//        and add them to an accumulator.
+//   4. If the scalar is even subtract G from the accumulator.
+//
+// Note: this function is designed to be constant-time.
+void ec_nistp_scalar_mul_base(const ec_nistp_meth *ctx,
+                              ec_nistp_felem_limb *x_out,
+                              ec_nistp_felem_limb *y_out,
+                              ec_nistp_felem_limb *z_out,
+                              const EC_SCALAR *scalar) {
+  // Recode the scalar into signed regular-wNAF digits, one per window.
+  int16_t rwnaf[SCALAR_MUL_MAX_NUM_WINDOWS];
+  scalar_rwnaf(rwnaf, SCALAR_MUL_WINDOW_SIZE, scalar, ctx->felem_num_bits);
+  size_t num_windows = DIV_AND_CEIL(ctx->felem_num_bits, SCALAR_MUL_WINDOW_SIZE);
+
+  // We need two point accumulators, so we define them of maximum size
+  // to avoid allocation, and just take pointers to individual coordinates.
+  // (This cruft will disappear when we refactor point_add/dbl to work with
+  // whole points instead of individual coordinates).
+  ec_nistp_felem_limb res[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
+  ec_nistp_felem_limb tmp[3 * FELEM_MAX_NUM_OF_LIMBS] = {0};
+  ec_nistp_felem_limb *x_res = &res[0];
+  ec_nistp_felem_limb *y_res = &res[ctx->felem_num_limbs];
+  ec_nistp_felem_limb *z_res = &res[ctx->felem_num_limbs * 2];
+  ec_nistp_felem_limb *x_tmp = &tmp[0];
+  ec_nistp_felem_limb *y_tmp = &tmp[ctx->felem_num_limbs];
+  ec_nistp_felem_limb *z_tmp = &tmp[ctx->felem_num_limbs * 2];
+
+  // Process the 4 groups of digits starting from group (3) down to group (0).
+  for (int i = 3; i >= 0; i--) {
+    // Double |res| SCALAR_MUL_WINDOW_SIZE times in each iteration, except the first.
+    for (size_t j = 0; i != 3 && j < SCALAR_MUL_WINDOW_SIZE; j++) {
+      ctx->point_dbl(x_res, y_res, z_res, x_res, y_res, z_res);
+    }
+
+    // Process the digits in the current group from the most to the least
+    // significant one; |start_idx| is the top digit index with j % 4 == i.
+    size_t start_idx = ((num_windows - i - 1) / 4) * 4 + i;
+
+    for (int j = start_idx; j >= 0; j -= 4) {
+      // For each digit |d| in the current group read the corresponding point
+      // from the table and add it to |res|. If |d| is negative, negate
+      // the point before adding it to |res|.
+      int16_t d = rwnaf[j];
+      int16_t is_neg = (d >> 15) & 1; // is_neg = (d < 0) ? 1 : 0
+      d = (d ^ -is_neg) + is_neg;     // d = abs(d)
+
+      int16_t idx = d >> 1;  // Odd |d| = 2*idx + 1 selects table entry |idx|.
+
+      // Select the point to add, in constant time.
+      size_t point_num_limbs = 2 * ctx->felem_num_limbs;  // Affine points.
+      size_t subtable_num_limbs = SCALAR_MUL_TABLE_NUM_POINTS * point_num_limbs;
+      size_t table_idx = (j / 4) * subtable_num_limbs;  // Start of sub-table j/4.
+      const ec_nistp_felem_limb *table = &ctx->scalar_mul_base_table[table_idx];
+      select_point_from_table(ctx, tmp, table, idx, 0);
+
+      // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
+      ec_nistp_felem ftmp;
+      ctx->felem_neg(ftmp, y_tmp);
+      // Conditionally select y or -y depending on the sign of the digit |d|.
+      cmovznz(y_tmp, ctx->felem_num_limbs, is_neg, y_tmp, ftmp);
+
+      // Add the affine table point to |res| (mixed addition, z2 = one).
+      ctx->point_add(x_res, y_res, z_res, x_res, y_res, z_res, 1,
+                     x_tmp, y_tmp, ctx->felem_one);
+    }
+  }
+
+  // Conditionally subtract G if the scalar is even, in constant-time.
+  const ec_nistp_felem_limb *x_mp = &ctx->scalar_mul_base_table[0];
+  const ec_nistp_felem_limb *y_mp = &ctx->scalar_mul_base_table[ctx->felem_num_limbs];
+  ec_nistp_felem ftmp;
+  ctx->felem_neg(ftmp, y_mp);
+
+  // Compute |tmp| = |res| - G, i.e., add the affine point -G = (x, -y).
+  ctx->point_add(x_tmp, y_tmp, z_tmp, x_res, y_res, z_res, 1, x_mp, ftmp, ctx->felem_one);
+
+  // Select |res| for an odd scalar, or |tmp| = |res| - G for an even one.
+  ec_nistp_felem_limb t = scalar->words[0] & 1;
+  cmovznz(x_out, ctx->felem_num_limbs, t, x_tmp, x_res);
+  cmovznz(y_out, ctx->felem_num_limbs, t, y_tmp, y_res);
+  cmovznz(z_out, ctx->felem_num_limbs, t, z_tmp, z_res);
+}
diff --git a/crypto/fipsmodule/ec/ec_nistp.h b/crypto/fipsmodule/ec/ec_nistp.h
index d567dc18ee..13e143a88c 100644
--- a/crypto/fipsmodule/ec/ec_nistp.h
+++ b/crypto/fipsmodule/ec/ec_nistp.h
@@ -54,6 +54,7 @@ typedef struct {
   void (*felem_sqr)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a);
   void (*felem_neg)(ec_nistp_felem_limb *c, const ec_nistp_felem_limb *a);
   ec_nistp_felem_limb (*felem_nz)(const ec_nistp_felem_limb *a);
+  const ec_nistp_felem_limb *felem_one;
 
   void (*point_dbl)(ec_nistp_felem_limb *x_out,
                     ec_nistp_felem_limb *y_out,
@@ -72,6 +73,7 @@ typedef struct {
                     const ec_nistp_felem_limb *y2,
                     const ec_nistp_felem_limb *z2);
 
+  const ec_nistp_felem_limb *scalar_mul_base_table;
 } ec_nistp_meth;
 
 const ec_nistp_meth *p256_methods(void);
@@ -106,5 +108,11 @@ void ec_nistp_scalar_mul(const ec_nistp_meth *ctx,
                          const ec_nistp_felem_limb *y_in,
                          const ec_nistp_felem_limb *z_in,
                          const EC_SCALAR *scalar);
+
+void ec_nistp_scalar_mul_base(const ec_nistp_meth *ctx,
+                              ec_nistp_felem_limb *x_out,
+                              ec_nistp_felem_limb *y_out,
+                              ec_nistp_felem_limb *z_out,
+                              const EC_SCALAR *scalar);
 #endif // EC_NISTP_H
 
diff --git a/crypto/fipsmodule/ec/p384.c b/crypto/fipsmodule/ec/p384.c
index f11bee777b..e3e8063ab3 100644
--- a/crypto/fipsmodule/ec/p384.c
+++ b/crypto/fipsmodule/ec/p384.c
@@ -91,16 +91,6 @@ static void p384_felem_copy(p384_limb_t out[P384_NLIMBS],
   }
 }
 
-static void p384_felem_cmovznz(p384_limb_t out[P384_NLIMBS],
-                               p384_limb_t t,
-                               const p384_limb_t z[P384_NLIMBS],
-                               const p384_limb_t nz[P384_NLIMBS]) {
-  p384_limb_t mask = constant_time_is_zero_w(t);
-  for (size_t i = 0; i < P384_NLIMBS; i++) {
-    out[i] = constant_time_select_w(mask, z[i], nz[i]);
-  }
-}
-
 static void p384_from_generic(p384_felem out, const EC_FELEM *in) {
 #ifdef OPENSSL_BIG_ENDIAN
   uint8_t tmp[P384_EC_FELEM_BYTES];
@@ -270,6 +260,8 @@ static void p384_point_add(p384_felem x3, p384_felem y3, p384_felem z3,
   ec_nistp_point_add(p384_methods(), x3, y3, z3, x1, y1, z1, mixed, x2, y2, z2);
 }
 
+#include "p384_table.h"
+
 #if defined(EC_NISTP_USE_S2N_BIGNUM)
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
     out->felem_num_limbs = P384_NLIMBS;
@@ -280,8 +272,10 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
     out->felem_sqr = bignum_montsqr_p384_selector;
     out->felem_neg = bignum_neg_p384;
     out->felem_nz  = p384_felem_nz;
+    out->felem_one = p384_felem_one;
     out->point_dbl = p384_point_double;
     out->point_add = p384_point_add;
+    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
 }
 #else
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
@@ -293,8 +287,10 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p384_methods) {
     out->felem_sqr = fiat_p384_square;
     out->felem_neg = fiat_p384_opp;
     out->felem_nz  = p384_felem_nz;
+    out->felem_one = p384_felem_one;
     out->point_dbl = p384_point_double;
     out->point_add = p384_point_add;
+    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p384_g_pre_comp;
 }
 #endif
 
@@ -494,20 +490,6 @@ OPENSSL_STATIC_ASSERT(P384_MUL_WSIZE == 5,
 #define P384_MUL_TABLE_SIZE     (P384_MUL_TWO_TO_WSIZE >> 1)
 #define P384_MUL_PUB_TABLE_SIZE (1 << (P384_MUL_PUB_WSIZE - 1))
 
-// p384_select_point_affine selects the |idx|-th affine point from
-// the given precomputed table and copies it to |out| in constant-time.
-static void p384_select_point_affine(p384_felem out[2],
-                                     size_t idx,
-                                     const p384_felem table[][2],
-                                     size_t table_size) {
-  OPENSSL_memset(out, 0, sizeof(p384_felem) * 2);
-  for (size_t i = 0; i < table_size; i++) {
-    p384_limb_t mismatch = i ^ idx;
-    p384_felem_cmovznz(out[0], mismatch, table[i][0], out[0]);
-    p384_felem_cmovznz(out[1], mismatch, table[i][1], out[1]);
-  }
-}
-
 // Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
@@ -526,135 +508,14 @@ static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
   p384_to_generic(&r->Z, res[2]);
 }
 
-// Include the precomputed table for the based point scalar multiplication.
-#include "p384_table.h"
-
 // Multiplication of the base point G of P-384 curve with the given scalar.
-// The product is computed with the Comb method using the precomputed table
-// |p384_g_pre_comp| from |p384_table.h| file and the regular-wNAF scalar
-// encoding.
-//
-// The |p384_g_pre_comp| table has 20 sub-tables each holding 16 points:
-//      0 :       [1]G,       [3]G,  ...,       [31]G
-//      1 :  [1*2^20]G,  [3*2^20]G,  ...,  [31*2^20]G
-//                         ...
-//      i : [1*2^20i]G, [3*2^20i]G,  ..., [31*2^20i]G
-//                         ...
-//     19 :   [2^380]G, [3*2^380]G,  ..., [31*2^380]G.
-// Computing the negation of a point P = (x, y) is relatively easy:
-//     -P = (x, -y).
-// So we may assume that for each sub-table we have 32 points instead of 16:
-//     [\pm 1*2^20i]G, [\pm 3*2^20i]G, ..., [\pm 31*2^20i]G.
-//
-// The 384-bit |scalar| is recoded (regular-wNAF encoding) into 77 signed
-// digits, each of length 5 bits, as explained in the
-// |p384_felem_mul_scalar_rwnaf| function. Namely,
-//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_76*2^380,
-// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
-// scalar we have that scalar = scalar', while in the case of an even
-// scalar we have that scalar = scalar' - 1.
-//
-// To compute the required product, [scalar]G, we may do the following.
-// Group the recoded digits of the scalar in 4 groups:
-//                                           |   corresponding multiples in
-//                   digits                  |   the recoded representation
-//    -------------------------------------------------------------------------
-//    (0): {s_0, s_4,  s_8, ..., s_72, s_76} |  { 2^0, 2^20, ..., 2^360, 2^380}
-//    (1): {s_1, s_5,  s_9, ..., s_73}       |  { 2^5, 2^25, ..., 2^365}
-//    (2): {s_2, s_6, s_10, ..., s_74}       |  {2^10, 2^30, ..., 2^370}
-//    (3): {s_3, s_7, s_11, ..., s_75}       |  {2^15, 2^35, ..., 2^375}
-//         corresponding sub-table lookup    |  {  T0,   T1, ...,   T18,   T19}
-//
-// The group (0) digits correspond precisely to the multiples of G that are
-// held in the 20 precomputed sub-tables, so we may simply read the appropriate
-// points from the sub-tables and sum them all up (negating if needed, i.e., if
-// a digit s_i is negative, we read the point corresponding to the abs(s_i) and
-// negate it before adding it to the sum).
-// The remaining three groups (1), (2), and (3), correspond to the multiples
-// of G from the sub-tables multiplied additionally by 2^5, 2^10, and 2^15,
-// respectively. Therefore, for these groups we may read the appropriate points
-// from the table, double them 5, 10, or 15 times, respectively, and add them
-// to the final result.
-//
-// To minimize the number of required doubling operations we process the digits
-// of the scalar from left to right. In other words, the algorithm is:
-//   1. Read the points corresponding to the group (3) digits from the table
-//      and add them to an accumulator.
-//   2. Double the accumulator 5 times.
-//   3. Repeat steps 1. and 2. for groups (2) and (1),
-//      and perform step 1. for group (0).
-//   4. If the scalar is even subtract G from the accumulator.
-//
-// Note: this function is constant-time.
 static void ec_GFp_nistp384_point_mul_base(const EC_GROUP *group,
                                            EC_JACOBIAN *r,
                                            const EC_SCALAR *scalar) {
+  p384_felem res[3] = {{0}, {0}, {0}};
 
-  p384_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}}, ftmp;
-  int16_t rnaf[P384_MUL_NWINDOWS] = {0};
-
-  // Recode the scalar.
-  scalar_rwnaf(rnaf, P384_MUL_WSIZE, scalar, 384);
-
-  // Process the 4 groups of digits starting from group (3) down to group (0).
-  for (int i = 3; i >= 0; i--) {
-    // Double |res| 5 times in each iteration, except in the first one.
-    for (int j = 0; i != 3 && j < P384_MUL_WSIZE; j++) {
-      p384_point_double(res[0], res[1], res[2], res[0], res[1], res[2]);
-    }
+  ec_nistp_scalar_mul_base(p384_methods(), res[0], res[1], res[2], scalar);
 
-    // Process the digits in the current group from the most to the least
-    // significant one (this is a requirement to ensure that the case of point
-    // doubling can't happen).
-    // For group (3) we process digits s_75 to s_3, for group (2) s_74 to s_2,
-    // group (1) s_73 to s_1, and for group (0) s_76 to s_0.
-    const size_t start_idx = ((P384_MUL_NWINDOWS - i - 1)/4)*4 + i;
-
-    for (int j = start_idx; j >= 0; j -= 4) {
-      // For each digit |d| in the current group read the corresponding point
-      // from the table and add it to |res|. If |d| is negative, negate
-      // the point before adding it to |res|.
-      int16_t d = rnaf[j];
-      // is_neg = (d < 0) ? 1 : 0
-      int16_t is_neg = (d >> 15) & 1;
-      // d = abs(d)
-      d = (d ^ -is_neg) + is_neg;
-
-      int16_t idx = d >> 1;
-
-      // Select the point to add, in constant time.
-      p384_select_point_affine(tmp, idx, p384_g_pre_comp[j / 4],
-                               P384_MUL_TABLE_SIZE);
-
-      // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
-      p384_felem_opp(ftmp, tmp[1]);
-      // Conditionally select y or -y depending on the sign of the digit |d|.
-      p384_felem_cmovznz(tmp[1], is_neg, tmp[1], ftmp);
-
-      // Add the point to the accumulator |res|.
-      // Note that the points in the pre-computed table are given with affine
-      // coordinates. The point addition function computes a sum of two points,
-      // either both given in projective, or one in projective and the other one
-      // in affine coordinates. The |mixed| flag indicates the latter option,
-      // in which case we set the third coordinate of the second point to one.
-      p384_point_add(res[0], res[1], res[2], res[0], res[1], res[2],
-                     1 /* mixed */, tmp[0], tmp[1], p384_felem_one);
-    }
-  }
-
-  // Conditionally subtract G if the scalar is even, in constant-time.
-  // First, compute |tmp| = |res| + (-G).
-  p384_felem_copy(tmp[0], p384_g_pre_comp[0][0][0]);
-  p384_felem_opp(tmp[1], p384_g_pre_comp[0][0][1]);
-  p384_point_add(tmp[0], tmp[1], tmp[2], res[0], res[1], res[2],
-                 1 /* mixed */, tmp[0], tmp[1], p384_felem_one);
-
-  // Select |res| or |tmp| based on the |scalar| parity.
-  p384_felem_cmovznz(res[0], scalar->words[0] & 1, tmp[0], res[0]);
-  p384_felem_cmovznz(res[1], scalar->words[0] & 1, tmp[1], res[1]);
-  p384_felem_cmovznz(res[2], scalar->words[0] & 1, tmp[2], res[2]);
-
-  // Copy the result to the output.
   p384_to_generic(&r->X, res[0]);
   p384_to_generic(&r->Y, res[1]);
   p384_to_generic(&r->Z, res[2]);
diff --git a/crypto/fipsmodule/ec/p521.c b/crypto/fipsmodule/ec/p521.c
index b1ed65dc7b..c35cde1d96 100644
--- a/crypto/fipsmodule/ec/p521.c
+++ b/crypto/fipsmodule/ec/p521.c
@@ -154,16 +154,6 @@ static void p521_felem_copy(p521_limb_t out[P521_NLIMBS],
   }
 }
 
-static void p521_felem_cmovznz(p521_limb_t out[P521_NLIMBS],
-                               p521_limb_t t,
-                               const p521_limb_t z[P521_NLIMBS],
-                               const p521_limb_t nz[P521_NLIMBS]) {
-  p521_limb_t mask = constant_time_is_zero_w(t);
-  for (size_t i = 0; i < P521_NLIMBS; i++) {
-    out[i] = constant_time_select_w(mask, z[i], nz[i]);
-  }
-}
-
 // NOTE: the input and output are in little-endian representation.
 static void p521_from_generic(p521_felem out, const EC_FELEM *in) {
 #ifdef OPENSSL_BIG_ENDIAN
@@ -288,6 +278,8 @@ static void p521_point_add(p521_felem x3, p521_felem y3, p521_felem z3,
   ec_nistp_point_add(p521_methods(), x3, y3, z3, x1, y1, z1, mixed, x2, y2, z2);
 }
 
+#include "p521_table.h"
+
 #if defined(EC_NISTP_USE_S2N_BIGNUM)
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
     out->felem_num_limbs = P521_NLIMBS;
@@ -298,8 +290,10 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
     out->felem_sqr = bignum_sqr_p521_selector;
     out->felem_neg = bignum_neg_p521;
     out->felem_nz  = p521_felem_nz;
+    out->felem_one = p521_felem_one;
     out->point_dbl = p521_point_double;
     out->point_add = p521_point_add;
+    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p521_g_pre_comp;
 }
 #else
 DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
@@ -311,8 +305,10 @@ DEFINE_METHOD_FUNCTION(ec_nistp_meth, p521_methods) {
     out->felem_sqr = fiat_secp521r1_carry_square;
     out->felem_neg = fiat_secp521r1_carry_opp;
     out->felem_nz  = p521_felem_nz;
+    out->felem_one = p521_felem_one;
     out->point_dbl = p521_point_double;
     out->point_add = p521_point_add;
+    out->scalar_mul_base_table = (const ec_nistp_felem_limb*) p521_g_pre_comp;
 }
 #endif
 
@@ -433,18 +429,6 @@ OPENSSL_STATIC_ASSERT(P521_MUL_WSIZE == 5,
 
 // p521_select_point_affine selects the |idx|-th affine point from
 // the given precomputed table and copies it to |out| in constant-time.
-static void p521_select_point_affine(p521_felem out[2],
-                                     size_t idx,
-                                     const p521_felem table[][2],
-                                     size_t table_size) {
-  OPENSSL_memset(out, 0, sizeof(p521_felem) * 2);
-  for (size_t i = 0; i < table_size; i++) {
-    p521_limb_t mismatch = i ^ idx;
-    p521_felem_cmovznz(out[0], mismatch, table[i][0], out[0]);
-    p521_felem_cmovznz(out[1], mismatch, table[i][1], out[1]);
-  }
-}
-
 // Multiplication of an arbitrary point by a scalar, r = [scalar]P.
 static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
                                       const EC_JACOBIAN *p,
@@ -463,135 +447,14 @@ static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
   p521_to_generic(&r->Z, res[2]);
 }
 
-// Include the precomputed table for the based point scalar multiplication.
-#include "p521_table.h"
-
 // Multiplication of the base point G of P-521 curve with the given scalar.
-// The product is computed with the Comb method using the precomputed table
-// |p521_g_pre_comp| from |p521_table.h| file and the regular-wNAF scalar
-// encoding.
-//
-// The |p521_g_pre_comp| table has 27 sub-tables each holding 16 points:
-//      0 :       [1]G,       [3]G,  ...,       [31]G
-//      1 :  [1*2^20]G,  [3*2^20]G,  ...,  [31*2^20]G
-//                         ...
-//      i : [1*2^20i]G, [3*2^20i]G,  ..., [31*2^20i]G
-//                         ...
-//     26 :   [2^520]G, [3*2^520]G,  ..., [31*2^520]G
-// Computing the negation of a point P = (x, y) is relatively easy:
-//     -P = (x, -y).
-// So we may assume that for each sub-table we have 32 points instead of 16:
-//     [\pm 1*2^20i]G, [\pm 3*2^20i]G, ..., [\pm 31*2^20i]G.
-//
-// The 521-bit |scalar| is recoded (regular-wNAF encoding) into 105 signed
-// digits, each of length 5 bits, as explained in the
-// |p521_felem_mul_scalar_rwnaf| function. Namely,
-//     scalar' = s_0 + s_1*2^5 + s_2*2^10 + ... + s_104*2^520,
-// where digits s_i are in [\pm 1, \pm 3, ..., \pm 31]. Note that for an odd
-// scalar we have that scalar = scalar', while in the case of an even
-// scalar we have that scalar = scalar' - 1.
-//
-// To compute the required product, [scalar]G, we may do the following.
-// Group the recoded digits of the scalar in 4 groups:
-//                                            |   corresponding multiples in
-//                  digits                    |   the recoded representation
-//   -------------------------------------------------------------------------
-//   (0): {s_0, s_4,  s_8, ..., s_100, s_104} |  { 2^0, 2^20, ..., 2^500, 2^520}
-//   (1): {s_1, s_5,  s_9, ..., s_101}        |  { 2^5, 2^25, ..., 2^505}
-//   (2): {s_2, s_6, s_10, ..., s_102}        |  {2^10, 2^30, ..., 2^510}
-//   (3): {s_3, s_7, s_11, ..., s_103}        |  {2^15, 2^35, ..., 2^515}
-//        corresponding sub-table lookup      |  {  T0,   T1, ...,   T25,   T26}
-//
-// The group (0) digits correspond precisely to the multiples of G that are
-// held in the 27 precomputed sub-tables, so we may simply read the appropriate
-// points from the sub-tables and sum them all up (negating if needed, i.e., if
-// a digit s_i is negative, we read the point corresponding to the abs(s_i) and
-// negate it before adding it to the sum).
-// The remaining three groups (1), (2), and (3), correspond to the multiples
-// of G from the sub-tables multiplied additionally by 2^5, 2^10, and 2^15,
-// respectively. Therefore, for these groups we may read the appropriate points
-// from the table, double them 5, 10, or 15 times, respectively, and add them
-// to the final result.
-//
-// To minimize the number of required doubling operations we process the digits
-// of the scalar from left to right. In other words, the algorithm is:
-//   1. Read the points corresponding to the group (3) digits from the table
-//      and add them to an accumulator.
-//   2. Double the accumulator 5 times.
-//   3. Repeat steps 1. and 2. for groups (2) and (1),
-//      and perform step 1. for group (0).
-//   4. If the scalar is even subtract G from the accumulator.
-//
-// Note: this function is constant-time.
 static void ec_GFp_nistp521_point_mul_base(const EC_GROUP *group,
                                            EC_JACOBIAN *r,
                                            const EC_SCALAR *scalar) {
+  p521_felem res[3] = {{0}, {0}, {0}};
 
-  p521_felem res[3] = {{0}, {0}, {0}}, tmp[3] = {{0}, {0}, {0}}, ftmp;
-  int16_t rnaf[P521_MUL_NWINDOWS] = {0};
-
-  // Recode the scalar.
-  scalar_rwnaf(rnaf, P521_MUL_WSIZE, scalar, 521);
-
-  // Process the 4 groups of digits starting from group (3) down to group (0).
-  for (int i = 3; i >= 0; i--) {
-    // Double |res| 5 times in each iteration, except in the first one.
-    for (size_t j = 0; i != 3 && j < P521_MUL_WSIZE; j++) {
-      p521_point_double(res[0], res[1], res[2], res[0], res[1], res[2]);
-    }
-
-    // Process the digits in the current group from the most to the least
-    // significant one (this is a requirement to ensure that the case of point
-    // doubling can't happen).
-    // For group (3) we process digits s_103 to s_3, for group (2) s_102 to s_2,
-    // group (1) s_101 to s_1, and for group (0) s_104 to s_0.
-    const size_t start_idx = ((P521_MUL_NWINDOWS - i - 1)/4)*4 + i;
-
-    for (int j = start_idx; j >= 0; j -= 4) {
-      // For each digit |d| in the current group read the corresponding point
-      // from the table and add it to |res|. If |d| is negative, negate
-      // the point before adding it to |res|.
-      int16_t d = rnaf[j];
-      // is_neg = (d < 0) ? 1 : 0
-      int16_t is_neg = (d >> 15) & 1;
-      // d = abs(d)
-      d = (d ^ -is_neg) + is_neg;
-
-      int16_t idx = d >> 1;
-
-      // Select the point to add, in constant time.
-      p521_select_point_affine(tmp, idx, p521_g_pre_comp[j / 4],
-                               P521_MUL_TABLE_SIZE);
-
-      // Negate y coordinate of the point tmp = (x, y); ftmp = -y.
-      p521_felem_opp(ftmp, tmp[1]);
-      // Conditionally select y or -y depending on the sign of the digit |d|.
-      p521_felem_cmovznz(tmp[1], is_neg, tmp[1], ftmp);
-
-      // Add the point to the accumulator |res|.
-      // Note that the points in the pre-computed table are given with affine
-      // coordinates. The point addition function computes a sum of two points,
-      // either both given in projective, or one in projective and the other one
-      // in affine coordinates. The |mixed| flag indicates the latter option,
-      // in which case we set the third coordinate of the second point to one.
-      p521_point_add(res[0], res[1], res[2], res[0], res[1], res[2],
-                     1 /* mixed */, tmp[0], tmp[1], p521_felem_one);
-    }
-  }
+  ec_nistp_scalar_mul_base(p521_methods(), res[0], res[1], res[2], scalar);
 
-  // Conditionally subtract G if the scalar is even, in constant-time.
-  // First, compute |tmp| = |res| + (-G).
-  p521_felem_copy(tmp[0], p521_g_pre_comp[0][0][0]);
-  p521_felem_opp(tmp[1], p521_g_pre_comp[0][0][1]);
-  p521_point_add(tmp[0], tmp[1], tmp[2], res[0], res[1], res[2],
-                 1 /* mixed */, tmp[0], tmp[1], p521_felem_one);
-
-  // Select |res| or |tmp| based on the |scalar| parity.
-  p521_felem_cmovznz(res[0], scalar->words[0] & 1, tmp[0], res[0]);
-  p521_felem_cmovznz(res[1], scalar->words[0] & 1, tmp[1], res[1]);
-  p521_felem_cmovznz(res[2], scalar->words[0] & 1, tmp[2], res[2]);
-
-  // Copy the result to the output.
   p521_to_generic(&r->X, res[0]);
   p521_to_generic(&r->Y, res[1]);
   p521_to_generic(&r->Z, res[2]);