From 4ff0c4547f99706aeb086993d1303c03c46ac699 Mon Sep 17 00:00:00 2001 From: Emman Date: Fri, 24 Dec 2021 13:13:11 +0800 Subject: [PATCH] optimize sm2 p256 amd64 implementation --- sm2/p256_asm_amd64.s | 739 +++++++++++++++++-------------------------- sm2/p256_asm_test.go | 91 ++++++ 2 files changed, 386 insertions(+), 444 deletions(-) diff --git a/sm2/p256_asm_amd64.s b/sm2/p256_asm_amd64.s index 3bbcce5c..e1fccc75 100644 --- a/sm2/p256_asm_amd64.s +++ b/sm2/p256_asm_amd64.s @@ -246,94 +246,66 @@ sqrLoop: ADCQ DX, t1 MOVQ t1, x_ptr // First reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc0 + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc0, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, acc0 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc0 // Second reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc1 + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc1 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc1 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, acc1 + ADCQ $0, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc0 + SBBQ DX, acc1 // Third reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc2 + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc2, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc2 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc2 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, acc2 + ADCQ $0, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + SUBQ AX, acc3 + SBBQ DX, acc0 + SBBQ AX, acc1 + SBBQ DX, acc2 // Last reduction step XORQ t0, t0 - MOVQ p256p<>+0x08(SB), AX - MULQ acc3 + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc3, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc3 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc3 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, acc3 + ADCQ $0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + SUBQ AX, acc0 + SBBQ DX, acc1 + SBBQ AX, acc2 + SBBQ DX, acc3 // Add bits [511:256] of the sqr result ADCQ acc4, acc0 @@ -400,27 +372,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0 MOVQ DX, acc4 XORQ acc5, acc5 // First reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc0 + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc0, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 ADCQ $0, acc5 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 + SBBQ $0, acc5 XORQ acc0, acc0 // x * y[1] @@ -456,27 +423,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0 ADCQ DX, acc5 ADCQ $0, acc0 // Second reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc1 + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc1 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc1 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 + ADCQ $0, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 ADCQ $0, acc0 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 + SBBQ $0, acc0 XORQ acc1, acc1 // x * y[2] @@ -512,27 +474,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0 ADCQ DX, acc0 ADCQ $0, acc1 // Third reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc2 + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc2, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc2 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc2 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 + ADCQ $0, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 ADCQ $0, acc1 + + SUBQ AX, acc3 + SBBQ DX, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 + SBBQ $0, acc1 XORQ acc2, acc2 // x * y[3] MOVQ (8*3)(y_ptr), t0 @@ -567,27 +524,22 @@ TEXT ·p256Mul(SB),NOSPLIT,$0 ADCQ DX, acc1 ADCQ $0, acc2 // Last reduction step - MOVQ p256p<>+0x08(SB), AX - MULQ acc3 + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc3, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc3 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc3 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 + ADCQ $0, acc5 + ADCQ $0, acc0 + ADCQ acc3, acc1 ADCQ $0, acc2 + + SUBQ AX, acc4 + SBBQ DX, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 + SBBQ $0, acc2 // Copy result [255:0] MOVQ acc4, x_ptr MOVQ acc5, acc3 @@ -625,93 +577,69 @@ TEXT ·p256FromMont(SB),NOSPLIT,$0 // Only reduce, no multiplications are needed // First stage - MOVQ p256p<>+0x08(SB), AX - MULQ acc0 + MOVQ acc0, AX + MOVQ acc0, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc0, acc1 - ADCQ $0, DX - ADDQ AX, acc1 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc0 - ADDQ t1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc0 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ DX, acc4 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ acc0, acc4 + + SUBQ AX, acc1 + SBBQ DX, acc2 + SBBQ AX, acc3 + SBBQ DX, acc4 XORQ acc5, acc5 // Second stage - MOVQ p256p<>+0x08(SB), AX - MULQ acc1 + MOVQ acc1, AX + MOVQ acc1, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc1, acc2 - ADCQ $0, DX - ADDQ AX, acc2 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc1 - ADDQ t1, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc1 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ DX, acc5 + ADCQ $0, acc3 + ADCQ $0, acc4 + ADCQ acc1, acc5 + + SUBQ AX, acc2 + SBBQ DX, acc3 + SBBQ AX, acc4 + SBBQ DX, acc5 XORQ acc0, acc0 // Third stage - MOVQ p256p<>+0x08(SB), AX - MULQ acc2 + MOVQ acc2, AX + MOVQ acc2, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc2, acc3 - ADCQ $0, DX - ADDQ AX, acc3 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc2 - ADDQ t1, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc2 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ DX, acc0 + ADCQ $0, acc4 + ADCQ $0, acc5 + ADCQ acc2, acc0 + + SUBQ AX, acc3 + SBBQ DX, acc4 + SBBQ AX, acc5 + SBBQ DX, acc0 XORQ acc1, acc1 // Last stage - MOVQ p256p<>+0x08(SB), AX - MULQ acc3 + MOVQ acc3, AX + MOVQ acc3, DX + SHLQ $32, AX + SHRQ $32, DX + ADDQ acc3, acc4 - ADCQ $0, DX - ADDQ AX, acc4 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x010(SB), AX - MULQ acc3 - ADDQ t1, acc5 - ADCQ $0, DX - ADDQ AX, acc5 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ p256p<>+0x018(SB), AX - MULQ acc3 - ADDQ t1, acc0 - ADCQ $0, DX - ADDQ AX, acc0 - ADCQ DX, acc1 + ADCQ $0, acc5 + ADCQ $0, acc0 + ADCQ acc3, acc1 + + SUBQ AX, acc4 + SBBQ DX, acc5 + SBBQ AX, acc0 + SBBQ DX, acc1 MOVQ acc4, x_ptr MOVQ acc5, acc3 @@ -1249,38 +1177,33 @@ ordSqrLoop: // First reduction step MOVQ acc0, AX MULQ p256ordK0<>(SB) - MOVQ AX, t0 + MOVQ AX, t0 // Y = t0 = (k0 * acc0) mod 2^64 MOVQ p256ord<>+0x00(SB), AX MULQ t0 - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 + ADDQ AX, acc0 // (carry1, acc0) = acc0 + t0 * ord0 + ADCQ $0, DX // DX = carry1 + H(t0 * ord0) + MOVQ DX, t1 // t1 = carry1 + H(t0 * ord0) + MOVQ t0, acc0 MOVQ p256ord<>+0x08(SB), AX MULQ t0 - ADDQ t1, acc1 - ADCQ $0, DX - ADDQ AX, acc1 + ADDQ t1, acc1 // (carry2, acc1) = acc1 + t1 + ADCQ $0, DX // DX = carry2 + H(t0*ord1) - MOVQ t0, t1 + ADDQ AX, acc1 // (carry3, acc1) = acc1 + t1 + L(t0*ord1) ADCQ DX, acc2 - ADCQ $0, t1 - SUBQ t0, acc2 - SBBQ $0, t1 + ADCQ $0, acc3 + ADCQ $0, acc0 MOVQ t0, AX MOVQ t0, DX - MOVQ t0, acc0 SHLQ $32, AX SHRQ $32, DX - ADDQ t1, acc3 - ADCQ $0, acc0 - SUBQ AX, acc3 + SUBQ t0, acc2 + SBBQ AX, acc3 SBBQ DX, acc0 - SUBQ t0, acc3 - SBBQ $0, acc0 // Second reduction step MOVQ acc1, AX MULQ p256ordK0<>(SB) @@ -1291,31 +1214,26 @@ ordSqrLoop: ADDQ AX, acc1 ADCQ $0, DX MOVQ DX, t1 + MOVQ t0, acc1 MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc2 ADCQ $0, DX - ADDQ AX, acc2 - MOVQ t0, t1 + ADDQ AX, acc2 ADCQ DX, acc3 - ADCQ $0, t1 - SUBQ t0, acc3 - SBBQ $0, t1 + ADCQ $0, acc0 + ADCQ $0, acc1 MOVQ t0, AX MOVQ t0, DX - MOVQ t0, acc1 SHLQ $32, AX SHRQ $32, DX - ADDQ t1, acc0 - ADCQ $0, acc1 - SUBQ AX, acc0 + SUBQ t0, acc3 + SBBQ AX, acc0 SBBQ DX, acc1 - SUBQ t0, acc0 - SBBQ $0, acc1 // Third reduction step MOVQ acc2, AX MULQ p256ordK0<>(SB) @@ -1326,31 +1244,26 @@ ordSqrLoop: ADDQ AX, acc2 ADCQ $0, DX MOVQ DX, t1 + MOVQ t0, acc2 MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc3 ADCQ $0, DX - ADDQ AX, acc3 - MOVQ t0, t1 + ADDQ AX, acc3 ADCQ DX, acc0 - ADCQ $0, t1 - SUBQ t0, acc0 - SBBQ $0, t1 + ADCQ $0, acc1 + ADCQ $0, acc2 MOVQ t0, AX MOVQ t0, DX - MOVQ t0, acc2 SHLQ $32, AX SHRQ $32, DX - ADDQ t1, acc1 - ADCQ $0, acc2 - SUBQ AX, acc1 + SUBQ t0, acc0 + SBBQ AX, acc1 SBBQ DX, acc2 - SUBQ t0, acc1 - SBBQ $0, acc2 // Last reduction step MOVQ acc3, AX MULQ p256ordK0<>(SB) @@ -1361,33 +1274,27 @@ ordSqrLoop: ADDQ AX, acc3 ADCQ $0, DX MOVQ DX, t1 + MOVQ t0, acc3 MOVQ p256ord<>+0x08(SB), AX MULQ t0 ADDQ t1, acc0 ADCQ $0, DX - ADDQ AX, acc0 - ADCQ $0, DX - MOVQ DX, t1 - MOVQ t0, t1 + ADDQ AX, acc0 ADCQ DX, acc1 - ADCQ $0, t1 - SUBQ t0, acc1 - SBBQ $0, t1 + ADCQ $0, acc2 + ADCQ $0, acc3 MOVQ t0, AX MOVQ t0, DX - MOVQ t0, acc3 SHLQ $32, AX SHRQ $32, DX - ADDQ t1, acc2 - ADCQ $0, acc3 - SUBQ AX, acc2 + SUBQ t0, acc1 + SBBQ AX, acc2 SBBQ DX, acc3 - SUBQ t0, acc2 - SBBQ $0, acc3 + XORQ t0, t0 // Add bits [511:256] of the sqr result ADCQ acc4, acc0 @@ -1591,93 +1498,65 @@ TEXT sm2P256MulInternal(SB),NOSPLIT,$0 ADCQ $0, mul1 MOVQ mul1, acc7 // First reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc0 + MOVQ acc0, mul0 + MOVQ acc0, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc0, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc0 - ADDQ hlp, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc0 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + + SUBQ mul0, acc1 + SBBQ mul1, acc2 + SBBQ mul0, acc3 + SBBQ mul1, acc0 // Second reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc1 + MOVQ acc1, mul0 + MOVQ acc1, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc1, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc1 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc1 - ADDQ hlp, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, acc1 + ADCQ $0, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + SUBQ mul0, acc2 + SBBQ mul1, acc3 + SBBQ mul0, acc0 + SBBQ mul1, acc1 // Third reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc2 + MOVQ acc2, mul0 + MOVQ acc2, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc2, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc2 - ADDQ hlp, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc2 - ADDQ hlp, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, acc2 + ADCQ $0, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + SUBQ mul0, acc3 + SBBQ mul1, acc0 + SBBQ mul0, acc1 + SBBQ mul1, acc2 // Last reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc3 + MOVQ acc3, mul0 + MOVQ acc3, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc3, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc3 - ADDQ hlp, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc3 - ADDQ hlp, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 + ADCQ $0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + SUBQ mul0, acc0 + SBBQ mul1, acc1 + SBBQ mul0, acc2 + SBBQ mul1, acc3 MOVQ $0, BP // Add bits [511:256] of the result ADCQ acc0, acc4 @@ -1777,93 +1656,65 @@ TEXT sm2P256SqrInternal(SB),NOSPLIT,$0 ADCQ mul0, t2 ADCQ DX, t3 // First reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc0 + MOVQ acc0, mul0 + MOVQ acc0, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc0, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc0 - ADDQ hlp, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc0 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, acc0 + ADCQ $0, acc2 + ADCQ $0, acc3 + ADCQ $0, acc0 + + SUBQ mul0, acc1 + SBBQ mul1, acc2 + SBBQ mul0, acc3 + SBBQ mul1, acc0 // Second reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc1 + MOVQ acc1, mul0 + MOVQ acc1, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc1, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc1 - ADDQ hlp, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc1 - ADDQ hlp, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, acc1 + ADCQ $0, acc3 + ADCQ $0, acc0 + ADCQ $0, acc1 + + SUBQ mul0, acc2 + SBBQ mul1, acc3 + SBBQ mul0, acc0 + SBBQ mul1, acc1 // Third reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc2 + MOVQ acc2, mul0 + MOVQ acc2, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc2, acc3 - ADCQ $0, mul1 - ADDQ mul0, acc3 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc2 - ADDQ hlp, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc2 - ADDQ hlp, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, acc2 + ADCQ $0, acc0 + ADCQ $0, acc1 + ADCQ $0, acc2 + + SUBQ mul0, acc3 + SBBQ mul1, acc0 + SBBQ mul0, acc1 + SBBQ mul1, acc2 // Last reduction step - MOVQ p256p<>+0x08(SB), mul0 - MULQ acc3 + MOVQ acc3, mul0 + MOVQ acc3, mul1 + SHLQ $32, mul0 + SHRQ $32, mul1 + ADDQ acc3, acc0 - ADCQ $0, mul1 - ADDQ mul0, acc0 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x010(SB), mul0 - MULQ acc3 - ADDQ hlp, acc1 - ADCQ $0, mul1 - ADDQ mul0, acc1 - ADCQ $0, mul1 - MOVQ mul1, hlp - MOVQ p256p<>+0x018(SB), mul0 - MULQ acc3 - ADDQ hlp, acc2 - ADCQ $0, mul1 - ADDQ mul0, acc2 - ADCQ $0, mul1 - MOVQ mul1, acc3 + ADCQ $0, acc1 + ADCQ $0, acc2 + ADCQ $0, acc3 + + SUBQ mul0, acc0 + SBBQ mul1, acc1 + SBBQ mul0, acc2 + SBBQ mul1, acc3 MOVQ $0, BP // Add bits [511:256] of the result ADCQ acc0, t0 diff --git a/sm2/p256_asm_test.go b/sm2/p256_asm_test.go index 4b099da6..a7ba3a04 100644 --- a/sm2/p256_asm_test.go +++ b/sm2/p256_asm_test.go @@ -7,8 +7,10 @@ import ( "crypto/rand" "encoding/hex" "fmt" + "io" "math/big" "testing" + "time" ) func toBigInt(in []uint64) *big.Int { @@ -115,6 +117,95 @@ func Test_p256Mul(t *testing.T) { } } +func p256SqrTest(t *testing.T, x, p, r *big.Int) { + x1 := new(big.Int).Mul(x, r) + x1 = x1.Mod(x1, p) + ax := make([]uint64, 4) + res := make([]uint64, 4) + res2 := make([]uint64, 4) + fromBig(ax, x1) + p256Sqr(res2, ax, 1) + p256FromMont(res, res2) + resInt := toBigInt(res) + + expected := new(big.Int).Mul(x, x) + expected = expected.Mod(expected, p) + if resInt.Cmp(expected) != 0 { + t.FailNow() + } +} + +func TestFuzzyP256Sqr(t *testing.T) { + p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) + var scalar1 [32]byte + var timeout *time.Timer + + if testing.Short() { + timeout = time.NewTimer(10 * time.Millisecond) + } else { + timeout = time.NewTimer(2 * time.Second) + } + for { + select { + case <-timeout.C: + return + default: + } + io.ReadFull(rand.Reader, scalar1[:]) + x := new(big.Int).SetBytes(scalar1[:]) + p256SqrTest(t, x, p, r) + } +} + +func p256MulTest(t *testing.T, x, y, p, r *big.Int) { + x1 := new(big.Int).Mul(x, r) + x1 = x1.Mod(x1, p) + y1 := new(big.Int).Mul(y, r) + y1 = y1.Mod(y1, p) + ax := make([]uint64, 4) + ay := make([]uint64, 4) + res := make([]uint64, 4) + res2 := make([]uint64, 4) + fromBig(ax, x1) + fromBig(ay, y1) + p256Mul(res2, ax, ay) + p256FromMont(res, res2) + resInt := toBigInt(res) + + expected := new(big.Int).Mul(x, y) + expected = expected.Mod(expected, p) + if resInt.Cmp(expected) != 0 { + t.FailNow() + } +} + +func TestFuzzyP256Mul(t *testing.T) { + p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16) + r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) + var scalar1 [32]byte + var scalar2 [32]byte + var timeout *time.Timer + + if testing.Short() { + timeout = time.NewTimer(10 * time.Millisecond) + } else { + timeout = time.NewTimer(2 * time.Second) + } + for { + select { + case <-timeout.C: + return + default: + } + io.ReadFull(rand.Reader, scalar1[:]) + io.ReadFull(rand.Reader, scalar2[:]) + x := new(big.Int).SetBytes(scalar1[:]) + y := new(big.Int).SetBytes(scalar2[:]) + p256MulTest(t, x, y, p, r) + } +} + func Test_p256MulSqr(t *testing.T) { r, _ := new(big.Int).SetString("10000000000000000000000000000000000000000000000000000000000000000", 16) p, _ := new(big.Int).SetString("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF", 16)