Skip to content

Commit

Permalink
[EC] P-256/384/521 s2n-bignum scalar multiplication (#2036)
Browse files Browse the repository at this point in the history
For curves P-256/384/521 we now use the s2n-bignum implementation
of scalar multiplication of an arbitrary point. This gives the following
performance improvements (measurements in ops/s):
```
__Apple M1__| before |  after | speedup |
P-256 MUL   |  27871 |  31607 |  1.13x  |
P-256 ECDH  |  20804 |  22778 |  1.11x  |
P-384 MUL   |   7245 |   8618 |  1.19x  |
P-384 ECDH  |   5367 |   5986 |  1.11x  |
P-521 MUL   |   5040 |   5806 |  1.15x  |
P-521 ECDH  |   3696 |   4053 |  1.10x  |

____Intel___| before |  after | speedup |
P-256 MUL   |  21913 |  25650 |  1.17x  |
P-256 ECDH  |  17188 |  19453 |  1.13x  |
P-384 MUL   |   6554 |   7691 |  1.17x  |
P-384 ECDH  |   4731 |   5321 |  1.12x  |
P-521 MUL   |   4400 |   5151 |  1.17x  |
P-521 ECDH  |   3192 |   3514 |  1.10x  |
```
where Apple M1 is an M1-based MacBook laptop, and
Intel is an Intel(R) Xeon(R) Platinum 8488C.
  • Loading branch information
dkostic authored Dec 13, 2024
1 parent 850af98 commit 02ea4c4
Show file tree
Hide file tree
Showing 7 changed files with 621 additions and 600 deletions.
7 changes: 7 additions & 0 deletions crypto/fipsmodule/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
set(
S2N_BIGNUM_ASM_SOURCES

p256/p256_montjscalarmul.S
p256/p256_montjscalarmul_alt.S

p384/bignum_add_p384.S
p384/bignum_sub_p384.S
p384/bignum_neg_p384.S
Expand All @@ -218,6 +221,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
p384/bignum_littleendian_6.S
p384/p384_montjdouble.S
p384/p384_montjdouble_alt.S
p384/p384_montjscalarmul.S
p384/p384_montjscalarmul_alt.S

p521/bignum_add_p521.S
p521/bignum_sub_p521.S
Expand All @@ -230,6 +235,8 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR
p521/bignum_fromlebytes_p521.S
p521/p521_jdouble.S
p521/p521_jdouble_alt.S
p521/p521_jscalarmul.S
p521/p521_jscalarmul_alt.S

curve25519/bignum_mod_n25519.S
curve25519/bignum_neg_p25519.S
Expand Down
13 changes: 13 additions & 0 deletions crypto/fipsmodule/ec/p256-nistz.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@
#include "../../internal.h"
#include "internal.h"
#include "p256-nistz.h"
#include "ec_nistp.h"

#if defined(EC_NISTP_USE_S2N_BIGNUM)
#include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h"
#endif

#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \
Expand Down Expand Up @@ -304,6 +309,13 @@ static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) {
static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *p,
const EC_SCALAR *scalar) {
#if defined(EC_NISTP_USE_S2N_BIGNUM)
ec_nistp_felem_limb in[P256_LIMBS * 3];
ec_nistp_felem_limb out[P256_LIMBS * 3];
ec_nistp_coordinates_to_point(in, p->X.words, p->Y.words, p->Z.words, P256_LIMBS);
p256_montjscalarmul_selector(out, scalar->words, in);
ec_nistp_point_to_coordinates(r->X.words, r->Y.words, r->Z.words, out, P256_LIMBS);
#else
stack_align_type buffer_out[32 + sizeof(P256_POINT)];
P256_POINT *aligned_out = (P256_POINT *) align_pointer(buffer_out, 32);
ecp_nistz256_windowed_mul(group, aligned_out, p, scalar);
Expand All @@ -312,6 +324,7 @@ static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
OPENSSL_memcpy(r->X.words, aligned_out->X, P256_LIMBS * sizeof(BN_ULONG));
OPENSSL_memcpy(r->Y.words, aligned_out->Y, P256_LIMBS * sizeof(BN_ULONG));
OPENSSL_memcpy(r->Z.words, aligned_out->Z, P256_LIMBS * sizeof(BN_ULONG));
#endif
}

static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r,
Expand Down
37 changes: 4 additions & 33 deletions crypto/fipsmodule/ec/p384.c
Original file line number Diff line number Diff line change
Expand Up @@ -438,39 +438,6 @@ static int ec_GFp_nistp384_cmp_x_coordinate(const EC_GROUP *group,
return 0;
}

// ----------------------------------------------------------------------------
// SCALAR MULTIPLICATION OPERATIONS
// ----------------------------------------------------------------------------
//
// The method for computing scalar products in functions:
// - |ec_GFp_nistp384_point_mul|,
// - |ec_GFp_nistp384_point_mul_base|,
// - |ec_GFp_nistp384_point_mul_public|,
// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
//
// One difference from the processing in the ECCKiila project is the order of
// the digit processing in |ec_GFp_nistp384_point_mul_base|, where we end the
// processing with the least significant digit to be able to apply the
// analysis results detailed at the bottom of this file. In
// |ec_GFp_nistp384_point_mul_base| and |ec_GFp_nistp384_point_mul|, we
// considered using window size 7 based on that same analysis. However, the
// table size and performance measurements were more preferable for window
// size 5. The potential issue with different window sizes is that for some
// sizes, a scalar can be found such that a case of point doubling instead of
// point addition happens in the scalar multiplication. This would make
// the multiplication non constant-time. To the best of our knowledge this
// timing leak is not an exploitable issue because the only scalar for which
// the leak can happen is already known by the attacker. This is also provided
// that this recoding and window size are only used with ECDH and ECDSA
// protocols. Any other use would need to be analyzed to determine whether it is
// secure and the user should be aware of this side channel of a particular
// scalar value.
//
// OpenSSL has a similar analysis for P-521 implementation:
// https://github.com/openssl/openssl/blob/e9492d1cecf459261f1f5ac0eb03e9c631600537/crypto/ec/ecp_nistp521.c#L1318
//
// For detailed analysis of different window sizes see the bottom of this file.

// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *p,
Expand All @@ -482,7 +449,11 @@ static void ec_GFp_nistp384_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
p384_from_generic(tmp[1], &p->Y);
p384_from_generic(tmp[2], &p->Z);

#if defined(EC_NISTP_USE_S2N_BIGNUM)
p384_montjscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
#else
ec_nistp_scalar_mul(p384_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
#endif

p384_to_generic(&r->X, res[0]);
p384_to_generic(&r->Y, res[1]);
Expand Down
35 changes: 4 additions & 31 deletions crypto/fipsmodule/ec/p521.c
Original file line number Diff line number Diff line change
Expand Up @@ -377,37 +377,6 @@ static void ec_GFp_nistp521_dbl(const EC_GROUP *group, EC_JACOBIAN *r,
p521_to_generic(&r->Z, z);
}

// ----------------------------------------------------------------------------
// SCALAR MULTIPLICATION OPERATIONS
// ----------------------------------------------------------------------------
//
// The method for computing scalar products in functions:
// - |ec_GFp_nistp521_point_mul|,
// - |ec_GFp_nistp521_point_mul_base|,
// - |ec_GFp_nistp521_point_mul_public|,
// is adapted from ECCKiila project (https://arxiv.org/abs/2007.11481).
// The main difference is that we use a window of size 7 instead of 5 for the
// first two functions. The potential issue with window sizes is that for some
// sizes a scalar can be found such that a case of point doubling instead of
// point addition happens in the scalar multiplication. This would make the
// multiplication non constant-time. Therefore, such window sizes have to be
// avoided. The windows size of 7 is chosen based on analysis analogous to
// the one in |ec_GFp_nistp_recode_scalar_bits| function in |util.c| file.
// See the analysis at the bottom of this file.
//
// Moreover, the order in which the digits of the scalar are processed in
// |ec_GFp_nistp521_point_mul_base| is different from the ECCKiila project, to
// ensure that the least significant digit is processed last which together
// with the window size 7 guarantees constant-time execution of the function.
//
// Another difference is that in |ec_GFp_nistp521_point_mul_public| function we
// use window size 5 for the public point and 7 for the base point. Here it is
// ok to use window of size 5 since the scalar is public and therefore the
// function doesn't have to be constant-time.
//
// The precomputed table of base point multiples is generated by the code in
// |make_tables.go| script.

// Multiplication of an arbitrary point by a scalar, r = [scalar]P.
static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
const EC_JACOBIAN *p,
Expand All @@ -419,7 +388,11 @@ static void ec_GFp_nistp521_point_mul(const EC_GROUP *group, EC_JACOBIAN *r,
p521_from_generic(tmp[1], &p->Y);
p521_from_generic(tmp[2], &p->Z);

#if defined(EC_NISTP_USE_S2N_BIGNUM)
p521_jscalarmul_selector((uint64_t*)res, scalar->words, (uint64_t*)tmp);
#else
ec_nistp_scalar_mul(p521_methods(), res[0], res[1], res[2], tmp[0], tmp[1], tmp[2], scalar);
#endif

p521_to_generic(&r->X, res[0]);
p521_to_generic(&r->Y, res[1]);
Expand Down
24 changes: 22 additions & 2 deletions third_party/s2n-bignum/include/s2n-bignum_aws-lc.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ static inline uint8_t use_s2n_bignum_alt(void) {
}
#endif

// Scalar multiplication of an arbitrary P-256 point.
// Inputs scalar[4], point[12]; output res[12] (all 64-bit words).
// NOTE(review): the "montj" prefix suggests that |point| and |res| are
// Jacobian (X, Y, Z) triples in Montgomery form -- confirm against the
// s2n-bignum documentation.
extern void p256_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
extern void p256_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]);
// Dispatches to |p256_montjscalarmul_alt| when |use_s2n_bignum_alt| returns
// non-zero and to |p256_montjscalarmul| otherwise.
static inline void p256_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 12], const uint64_t scalar[S2N_BIGNUM_STATIC 4], uint64_t point[S2N_BIGNUM_STATIC 12]) {
  if (!use_s2n_bignum_alt()) {
    p256_montjscalarmul(res, scalar, point);
    return;
  }
  p256_montjscalarmul_alt(res, scalar, point);
}

// Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced
// Inputs x[6], y[6]; output z[6]
extern void bignum_add_p384(uint64_t z[S2N_BIGNUM_STATIC 6], const uint64_t x[S2N_BIGNUM_STATIC 6], const uint64_t y[S2N_BIGNUM_STATIC 6]);
Expand Down Expand Up @@ -110,6 +117,13 @@ static inline void p384_montjdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 18],u
else { p384_montjdouble(p3, p1); }
}

// Scalar multiplication of an arbitrary P-384 point.
// Inputs scalar[6], point[18]; output res[18] (all 64-bit words).
// NOTE(review): the "montj" prefix suggests that |point| and |res| are
// Jacobian (X, Y, Z) triples in Montgomery form -- confirm against the
// s2n-bignum documentation.
extern void p384_montjscalarmul(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
extern void p384_montjscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]);
// Dispatches to |p384_montjscalarmul_alt| when |use_s2n_bignum_alt| returns
// non-zero and to |p384_montjscalarmul| otherwise.
static inline void p384_montjscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 18], const uint64_t scalar[S2N_BIGNUM_STATIC 6], uint64_t point[S2N_BIGNUM_STATIC 18]) {
  if (!use_s2n_bignum_alt()) {
    p384_montjscalarmul(res, scalar, point);
    return;
  }
  p384_montjscalarmul_alt(res, scalar, point);
}

// Convert 6-digit (384-bit) bignum from little-endian form
// Input x[6]; output z[6]
extern void bignum_fromlebytes_6(uint64_t z[S2N_BIGNUM_STATIC 6], const uint8_t x[S2N_BIGNUM_STATIC 48]);
Expand Down Expand Up @@ -158,12 +172,18 @@ extern void bignum_fromlebytes_p521(uint64_t z[S2N_BIGNUM_STATIC 9], const uint8
// Convert 9-digit 528-bit bignum to little-endian bytes
extern void bignum_tolebytes_p521(uint8_t z[S2N_BIGNUM_STATIC 66], const uint64_t x[S2N_BIGNUM_STATIC 9]);

extern void p521_jdouble(uint64_t p3[static 27],uint64_t p1[static 27]);
extern void p521_jdouble_alt(uint64_t p3[static 27],uint64_t p1[static 27]);
// P-521 point doubling.
// Input p1[27]; output p3[27] (64-bit words).
// NOTE(review): the "j" prefix suggests Jacobian (X, Y, Z) coordinates of
// 9 words each -- confirm against the s2n-bignum documentation.
extern void p521_jdouble(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
extern void p521_jdouble_alt(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]);
// Dispatches to |p521_jdouble_alt| when |use_s2n_bignum_alt| returns non-zero
// and to |p521_jdouble| otherwise.
static inline void p521_jdouble_selector(uint64_t p3[S2N_BIGNUM_STATIC 27],uint64_t p1[S2N_BIGNUM_STATIC 27]) {
  if (!use_s2n_bignum_alt()) {
    p521_jdouble(p3, p1);
    return;
  }
  p521_jdouble_alt(p3, p1);
}
// Scalar multiplication of an arbitrary P-521 point.
// Inputs scalar[9], point[27]; output res[27] (all 64-bit words).
// NOTE(review): the "j" prefix suggests that |point| and |res| are Jacobian
// (X, Y, Z) triples of 9 words each -- confirm against the s2n-bignum
// documentation.
extern void p521_jscalarmul(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
extern void p521_jscalarmul_alt(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]);
// Dispatches to |p521_jscalarmul_alt| when |use_s2n_bignum_alt| returns
// non-zero and to |p521_jscalarmul| otherwise.
static inline void p521_jscalarmul_selector(uint64_t res[S2N_BIGNUM_STATIC 27], const uint64_t scalar[S2N_BIGNUM_STATIC 9], const uint64_t point[S2N_BIGNUM_STATIC 27]) {
  if (!use_s2n_bignum_alt()) {
    p521_jscalarmul(res, scalar, point);
    return;
  }
  p521_jscalarmul_alt(res, scalar, point);
}

// curve25519_x25519_byte and curve25519_x25519_byte_alt computes the x25519
// function specified in https://www.rfc-editor.org/rfc/rfc7748. |scalar| is the
Expand Down
10 changes: 7 additions & 3 deletions util/fipstools/delocate/delocate.peg
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,19 @@ ARMPostincrement <- '!'
BaseIndexScale <- '(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)? )? ')'
Operator <- [+\-]
OffsetOperator <- '+' / '-' / '*'
# s2n-bignum code has a lot of different and complex ways to compute an offset.
# For example, (7*72)+(3*72)*(5-1)+8+0*72. We define S2nBignumHelper in an attempt
# to simplify the expressions for Offset.
S2nBignumHelper <- '(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' WS? OffsetOperator? WS?
Offset <- '+'? '-'? (("0b" [01]+) /
("0x" [[0-9A-F]]+) /
([0-9]+ WS OffsetOperator [0-9]+ /
[0-9]+ ( OffsetOperator '(' [0-9]+ OffsetOperator [0-9]+ ')' )? /
[0-9]+ ( OffsetOperator [0-9]+ OffsetOperator [0-9]+ )? /
[0-9]+ ( OffsetOperator [0-9]+ )? /
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ OffsetOperator [0-9]+ /
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ !'x' /
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' /
S2nBignumHelper S2nBignumHelper (S2nBignumHelper ([0-9]+ OffsetOperator)? [0-9]+ OffsetOperator)? [0-9]+ /
S2nBignumHelper [0-9]+ ((WS? OffsetOperator [0-9]+ (WS? OffsetOperator [0-9]+)?) / (!'x')) /
S2nBignumHelper /
'(' [0-9]+ WS? OffsetOperator WS? [0-9]+ WS? OffsetOperator WS? [0-9]+')')![[A-Z]]
)
Section <- [[A-Z@]]+
Expand Down
Loading

0 comments on commit 02ea4c4

Please sign in to comment.