diff --git a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S b/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S index 342b844dd6..3e72ebd67f 100644 --- a/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S +++ b/third_party/s2n-bignum/arm/fastmul/bignum_emontredc_8n_neon.S @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- -// Extend Montgomery reduce in 8-digit blocks, results in input-output buffer +// Extended Montgomery reduce in 8-digit blocks, results in input-output buffer // Inputs z[2*k], m[k], w; outputs function return (extra result bit) and z[2*k] // // extern uint64_t bignum_emontredc_8n_neon diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S index 1e31f070b9..ea0bef702c 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_16_neon.S @@ -12,7 +12,7 @@ // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker -// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X4 = idx +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" diff --git a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S index 773a6d5745..c3dc386990 100644 --- a/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S +++ b/third_party/s2n-bignum/arm/generic/bignum_copy_row_from_table_32_neon.S @@ -12,7 +12,7 @@ // (uint64_t *z, uint64_t *table, uint64_t height, uint64_t idx); // // Initial version written by Hanno Becker -// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X4 = idx +// Standard ARM ABI: X0 = z, X1 = table, X2 = height, X3 = idx // ---------------------------------------------------------------------------- #include "_internal_s2n_bignum.h" diff --git a/third_party/s2n-bignum/arm/p384/p384_montjadd.S b/third_party/s2n-bignum/arm/p384/p384_montjadd.S index 3604313273..4f4a60d260 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjadd.S +++ b/third_party/s2n-bignum/arm/p384/p384_montjadd.S @@ -32,7 +32,9 @@ #define NUMSIZE 48 -#define NSPACE (NUMSIZE*7) +// 7 NUMSIZEs for the point operation, one extra NUMSIZE for field operations + +#define NSPACE (NUMSIZE*8) S2N_BN_SYMBOL(p384_montjadd): @@ -661,7 +663,7 @@ S2N_BN_SYMBOL(p384_montjadd): stp x7, x11, [sp, #256] adc x17, x14, xzr stp x2, x17, [sp, #272] - stp x23, x24, [sp, #-48] + stp x23, x24, [sp, #0x150] // It is #-48 after inlining, but access to sp+negative in the middle of fn is bad ldr q3, [x26, #96] ldr q25, [x25, #48] ldp x13, x23, [x25, #48] @@ -5902,7 +5904,7 @@ S2N_BN_SYMBOL(p384_montjadd): stp x14, x5, [sp, #256] adc x12, x15, x23 stp x21, x12, [sp, #272] - ldp x2, x27, [sp, #-48] + ldp x2, x27, [sp, #0x150] // It is #-48 after inlining, but access to sp+negative in the middle of fn is bad ldr q3, [sp, #48] ldr q25, [sp, #192] ldp x13, x23, [sp, #192] diff --git a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S b/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S index 11b5215b4c..2bd405e245 100644 --- a/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S +++ 
b/third_party/s2n-bignum/arm/p384/p384_montjscalarmul.S @@ -503,7 +503,7 @@ p384_montjscalarmul_p384_montjadd: stp x23, x24, [sp, #-16]! stp x25, x26, [sp, #-16]! stp x27, xzr, [sp, #-16]! - sub sp, sp, #0x150 + sub sp, sp, #0x180 mov x24, x0 mov x25, x1 mov x26, x2 @@ -1120,7 +1120,7 @@ p384_montjscalarmul_p384_montjadd: stp x7, x11, [sp, #256] adc x17, x14, xzr stp x2, x17, [sp, #272] - stp x23, x24, [sp, #-48] + stp x23, x24, [sp, #0x150] ldr q3, [x26, #96] ldr q25, [x25, #48] ldp x13, x23, [x25, #48] @@ -6361,7 +6361,7 @@ p384_montjscalarmul_p384_montjadd: stp x14, x5, [sp, #256] adc x12, x15, x23 stp x21, x12, [sp, #272] - ldp x2, x27, [sp, #-48] + ldp x2, x27, [sp, #0x150] ldr q3, [sp, #48] ldr q25, [sp, #192] ldp x13, x23, [sp, #192] @@ -6874,7 +6874,7 @@ p384_montjscalarmul_p384_montjadd: stp x12, x13, [x27, #96] stp x14, x15, [x27, #112] stp x16, x17, [x27, #128] - add sp, sp, #0x150 + add sp, sp, #0x180 ldp x27, xzr, [sp], #16 ldp x25, x26, [sp], #16 ldp x23, x24, [sp], #16 diff --git a/third_party/s2n-bignum/arm/p521/Makefile b/third_party/s2n-bignum/arm/p521/Makefile index 3e5e0e855c..3936b48307 100644 --- a/third_party/s2n-bignum/arm/p521/Makefile +++ b/third_party/s2n-bignum/arm/p521/Makefile @@ -28,6 +28,7 @@ OBJ = bignum_add_p521.o \ bignum_double_p521.o \ bignum_fromlebytes_p521.o \ bignum_half_p521.o \ + bignum_inv_p521.o \ bignum_mod_n521_9.o \ bignum_mod_p521_9.o \ bignum_montmul_p521.o \ @@ -53,7 +54,9 @@ OBJ = bignum_add_p521.o \ p521_jdouble.o \ p521_jdouble_alt.o \ p521_jmixadd.o \ - p521_jmixadd_alt.o + p521_jmixadd_alt.o \ + p521_jscalarmul.o \ + p521_jscalarmul_alt.o %.o : %.S ; $(CC) -E -I../../include $< | $(GAS) -o $@ - diff --git a/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S b/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S new file mode 100644 index 0000000000..731b9784d6 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/bignum_inv_p521.S @@ -0,0 +1,1696 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +// +// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); +// +// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible +// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that +// x does not need to be reduced modulo p_521, but the output always is. 
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(9*N) +#define u sp, #(18*N) +#define v sp, #(27*N) + +// Total size to reserve on the stack + +#define NSPACE #(36*N) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. +// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + 
csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x8, x4, #0x100, lsl #12; \ + sbfx x8, x8, #21, #21; \ + mov x11, #0x100000; \ + add x11, x11, x11, lsl #21; \ + add x9, x4, x11; \ + asr x9, x9, #42; \ + add x10, x5, #0x100, lsl #12; \ + sbfx x10, x10, #21, #21; \ + add x11, x5, x11; \ + asr x11, x11, #42; \ + mul x6, x8, x2; \ + mul x7, x9, x3; \ + mul x2, x10, x2; \ + mul x3, x11, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr 
x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #21, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #42; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #21, #21; \ + add x15, x5, x15; \ + asr x15, x15, #42; \ + mul x6, x12, x2; \ + mul x7, x13, x3; \ + mul x2, x14, x2; \ + mul x3, x15, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, 
#0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x6, x14, x8; \ + mul x7, x14, x9; \ + madd x8, x13, x10, x2; \ + madd x9, x13, x11, x3; \ + madd x16, x15, x10, x6; \ + madd x17, x15, x11, x7; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel 
x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #22, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #43; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #22, #21; \ + add x15, x5, x15; \ + asr x15, x15, #43; \ + mneg x2, x12, x8; \ + mneg x3, x12, x9; \ + mneg x4, x14, x8; \ + mneg x5, x14, x9; \ + msub m00, x13, x16, x2; \ + msub m01, x13, x17, x3; \ + msub m10, x15, x16, x4; \ + msub m11, x15, x17, x5 + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(bignum_inv_p521): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime p_521 = 2^521 - 1 into the f variable + + mov x10, #0xFFFFFFFFFFFFFFFF + stp x10, x10, [f] + stp x10, x10, [f+16] + stp x10, x10, [f+32] + stp x10, x10, [f+48] + mov x11, #0x1FF + str x11, [f+64] + +// Copy the input into the g variable, but reduce it strictly mod p_521 +// so that g <= f as assumed in the bound proof. This code fragment is +// very similar to bignum_mod_p521_9 complete with carry condensation. + + ldr x8, [x1, #64] + lsr x9, x8, #9 + + subs xzr, xzr, xzr + ldp x10, x11, [x1] + adcs xzr, x10, x9 + adcs xzr, x11, xzr + ldp x12, x13, [x1, #16] + and x7, x12, x13 + adcs xzr, x7, xzr + ldp x14, x15, [x1, #32] + and x7, x14, x15 + adcs xzr, x7, xzr + ldp x16, x17, [x1, #48] + and x7, x16, x17 + adcs xzr, x7, xzr + orr x7, x8, #~0x1FF + adcs x7, x7, xzr + + adcs x10, x10, x9 + adcs x11, x11, xzr + adcs x12, x12, xzr + adcs x13, x13, xzr + adcs x14, x14, xzr + adcs x15, x15, xzr + adcs x16, x16, xzr + adcs x17, x17, xzr + adc x8, x8, xzr + and x8, x8, #0x1FF + + stp x10, x11, [g] + stp x12, x13, [g+16] + stp x14, x15, [g+32] + stp x16, x17, [g+48] + str x8, [g+64] + +// Also maintain weakly reduced < 2*p_521 vector [u,v] such that +// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) +// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) +// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have +// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. +// +// Based on the standard divstep bound, for inputs <= 2^b we need at least +// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. +// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 +// making *1239* total. (With a bit more effort we could avoid the full 59 +// divsteps and use a shorter tail computation, but we keep it simple.) +// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since +// |f| = 1 we get the modular inverse from u by flipping its sign with f. 
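+//
+// Spelling out the arithmetic behind those constants: with b = 521 the bound
+// is (9437*521 + 1)/4096 = 4916678/4096, so n >= 1201, and 21 blocks of 59
+// give 21*59 = 1239 >= 1201. For the initializer, -1239 + 3*521 = 324 and
+// 324 = 5*64 + 4, so 2^324 is just digit 5 of v set to 2^4 = 16, with all
+// other digits of u and v zero (u itself starts at 0).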
+ + stp xzr, xzr, [u] + stp xzr, xzr, [u+16] + stp xzr, xzr, [u+32] + stp xzr, xzr, [u+48] + str xzr, [u+64] + + mov x10, #16 + stp xzr, xzr, [v] + stp xzr, xzr, [v+16] + stp xzr, x10, [v+32] + stp xzr, xzr, [v+48] + str xzr, [v+64] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special 21st iteration after a uniform +// first 20. + + mov i, #21 + mov d, #1 + b midloop + +loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digit 3 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + +// Digit 4 of [f,g] + + ldr x7, [f+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [g+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [f+3*N] + + eor x1, x7, s10 + mul x0, 
x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [g+3*N] + +// Digit 5 of [f,g] + + ldr x7, [f+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [g+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [g+4*N] + +// Digit 6 of [f,g] + + ldr x7, [f+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f+5*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g+5*N] + +// Digit 7 of [f,g] + + ldr x7, [f+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+6*N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+6*N] + +// Digits 8 and 9 of [f,g] + + ldr x7, [f+8*N] + eor x1, x7, s00 + asr x3, x1, #63 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+8*N] + eor x1, x8, s01 + asr x0, x1, #63 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+7*N] + extr x5, x3, x5, #59 + str x5, [f+8*N] + + eor x1, x7, s10 + asr x5, x1, #63 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + asr x0, x1, #63 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+7*N] + extr x2, x5, x2, #59 + str x2, [g+8*N] + +// Now the computation of the updated u and v values and their +// modular reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
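+//
+// Concretely, the accumulation below computes the matrix-vector update
+//   u' = m00 * u + m01 * v,   v' = m10 * u + m11 * v
+// with the signs s00..s11 folded in via the complement-and-offset trick
+// above, and each result is then reduced modulo p_521 before being stored.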
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digit 3 of [u,v] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x6, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + str x2, [v+3*N] + adc x6, x6, x1 + +// Digit 4 of [u,v] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x6, x6, x0 + adc x5, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x6, x6, x0 + str x6, [v+4*N] + adc x5, x5, x1 + +// Digit 5 of [u,v] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, x5, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v+5*N] + adc x3, x3, x1 + +// Digit 6 of [u,v] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+6*N] + adc x4, x4, x1 + +// Digit 7 of [u,v] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, 
m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+7*N] + adc x2, x2, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x0, x1, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [u] + ldp x0, x1, [u+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+16] + ldp x0, x1, [u+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+32] + ldp x0, x1, [u+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [u+48] + adc x5, x5, x6 + str x5, [u+64] + +// Digits 8 and 9 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Modular reduction of v, reloading as needed from v[0],...,v[7],x2,x5 + + extr x6, x5, x2, #9 + ldp x0, x1, [v] + add x6, x6, x5, asr #63 + sub x2, x2, x6, lsl #9 + adds x0, x0, x6 + asr x6, x6, #63 + adcs x1, x1, x6 + stp x0, x1, [v] + ldp x0, x1, [v+16] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+16] + ldp x0, x1, [v+32] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+32] + ldp x0, x1, [v+48] + adcs x0, x0, x6 + adcs x1, x1, x6 + stp x0, x1, [v+48] + adc x2, x2, x6 + str x2, [v+64] + +midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne loop + +// The 21st and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_521) +// we want to flip the sign of u according to that of f. 
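+//
+// In other words, at this point f == x * u (mod p_521) with f equal to +1
+// or -1, so when f = -1 we have x * (-u) == 1 (mod p_521) and the negation
+// of u is the inverse. The eor of the sign masks with x0 below folds that
+// final negation into the usual sign-magnitude handling of the last update.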
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digit 3 of [u] + + ldr x7, [u+3*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, xzr, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + str x5, [u+3*N] + adc x3, x3, x1 + +// Digit 4 of [u] + + ldr x7, [u+4*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x3, x3, x0 + adc x4, xzr, x1 + ldr x8, [v+4*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x3, x3, x0 + str x3, [u+4*N] + adc x4, x4, x1 + +// Digit 5 of [u] + + ldr x7, [u+5*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, x4, x0 + adc x2, xzr, x1 + ldr x8, [v+5*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u+5*N] + adc x2, x2, x1 + +// Digit 6 of [u] + + ldr x7, [u+6*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+6*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+6*N] + adc x6, x6, x1 + +// Digit 7 of [u] + + ldr x7, [u+7*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+7*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+7*N] + adc x5, x5, x1 + +// Digits 8 and 9 of u (top is unsigned) + + ldr x7, [u+8*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+8*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Modular reduction of u, reloading as needed from u[0],...,u[7],x5,x3 + + extr x6, x3, x5, #9 + ldp x10, x11, [u] + add x6, x6, x3, asr #63 + sub x5, x5, x6, lsl #9 + adds x10, x10, x6 + asr x6, x6, #63 + adcs x11, x11, x6 + ldp x12, x13, [u+16] + adcs x12, x12, x6 + adcs x13, x13, x6 + ldp x14, x15, [u+32] + adcs x14, x14, x6 + adcs x15, x15, x6 + ldp x16, x17, [u+48] + adcs x16, x16, x6 + adcs x17, x17, x6 + adc x19, x5, x6 + +// Further strict reduction ready for the output, which just means +// a conditional subtraction of p_521 + + subs x0, x10, #-1 + adcs x1, x11, xzr + adcs x2, x12, xzr + adcs x3, x13, xzr + adcs x4, x14, xzr + adcs x5, x15, xzr + adcs x6, x16, xzr + adcs x7, x17, xzr + mov x8, #0x1FF + sbcs x8, 
x19, x8 + + csel x0, x0, x10, cs + csel x1, x1, x11, cs + csel x2, x2, x12, cs + csel x3, x3, x13, cs + csel x4, x4, x14, cs + csel x5, x5, x15, cs + csel x6, x6, x16, cs + csel x7, x7, x17, cs + csel x8, x8, x19, cs + +// Store it back to the final output + + stp x0, x1, [res] + stp x2, x3, [res, #16] + stp x4, x5, [res, #32] + stp x6, x7, [res, #48] + str x8, [res, #64] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jadd.S b/third_party/s2n-bignum/arm/p521/p521_jadd.S index 340766e6a2..6dbcad2b7b 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jadd.S +++ b/third_party/s2n-bignum/arm/p521/p521_jadd.S @@ -347,1052 +347,1220 @@ S2N_BN_SYMBOL(p521_jadd): ldp x19, x20, [sp], 16 ret -// Local versions of the three field operations, almost identical to -// bignum_mul_p521, bignum_sqr_p521 and bignum_sub_p521 except for -// avoiding all intial register save-restore, and in the case of -// local_mul_p521, using the tmp buffer as temporary storage and -// avoiding x26. +// Local versions of the three field operations, identical to +// bignum_mul_p521_neon, bignum_sqr_p521_neon and bignum_sub_p521. local_mul_p521: - ldp x3, x4, [x1] - ldp x5, x6, [x1, #16] - ldp x7, x8, [x2] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 - subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, 
lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - lsl x21, x11, #9 - extr x11, x12, x11, #55 - extr x12, x13, x12, #55 - extr x13, x14, x13, #55 - lsr x14, x14, #55 - ldp x3, x4, [x1, #32] - ldp x5, x6, [x1, #48] - ldp x7, x8, [x2, #32] - ldp x9, x10, [x2, #48] - stp x15, x16, [tmp] - stp x17, x19, [tmp+16] - stp x21, x11, [tmp+32] - stp x12, x13, [tmp+48] - str x14, [tmp+64] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 - subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - ldp x23, x22, [tmp] - adds x11, x11, x23 - adcs x12, x12, x22 - stp x11, x12, [tmp] - ldp x23, x22, [tmp+16] - adcs x13, x13, x23 - adcs x14, x14, x22 - stp x13, x14, [tmp+16] - ldp x23, x22, [tmp+32] - adcs x15, x15, x23 - adcs x16, x16, x22 - stp x15, x16, [tmp+32] - ldp x23, x22, [tmp+48] - adcs x17, x17, x23 - adcs x19, x19, x22 - stp x17, x19, [tmp+48] - ldr x21, [tmp+64] - adc x21, x21, xzr - str x21, [tmp+64] - ldp x23, x22, [x1] - subs x3, x3, x23 - sbcs x4, x4, x22 - ldp x23, x22, [x1, #16] - sbcs x5, x5, x23 - sbcs x6, x6, x22 - csetm x24, lo - ldp x23, x22, [x2] - subs x7, x23, x7 - sbcs x8, x22, x8 - ldp x23, x22, [x2, #16] - 
sbcs x9, x23, x9 - sbcs x10, x22, x10 - csetm x25, lo - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - eor x7, x7, x25 - subs x7, x7, x25 - eor x8, x8, x25 - sbcs x8, x8, x25 - eor x9, x9, x25 - sbcs x9, x9, x25 - eor x10, x10, x25 - sbc x10, x10, x25 - eor x25, x25, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 - subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - ldp x3, x4, [tmp] - ldp x5, x6, [tmp+16] - eor x11, x11, x25 - adds x11, x11, x3 - eor x12, x12, x25 - adcs x12, x12, x4 - eor x13, x13, x25 - adcs x13, x13, x5 - eor x14, x14, x25 - adcs x14, x14, x6 - eor x15, x15, x25 - ldp x7, x8, [tmp+32] - ldp x9, x10, [tmp+48] - ldr x20, [tmp+64] - adcs x15, x15, x7 - eor x16, x16, x25 - adcs x16, x16, x8 - eor x17, x17, x25 - adcs x17, x17, x9 - eor x19, x19, x25 - adcs x19, x19, x10 - adc x21, x20, xzr - adds x15, x15, x3 - adcs x16, x16, x4 - adcs x17, x17, x5 - adcs x19, x19, x6 - and x25, x25, #0x1ff - lsl x24, x11, #9 - orr x24, x24, x25 - adcs x7, x7, x24 - extr x24, x12, x11, #55 - adcs x8, x8, x24 - extr x24, x13, x12, #55 - adcs x9, x9, x24 - extr x24, x14, x13, #55 - adcs x10, x10, x24 - lsr x24, x14, #55 - adc x20, x24, x20 - ldr x6, [x2, #64] - ldp x3, x4, [x1] - 
and x23, x3, #0xfffffffffffff - mul x23, x6, x23 - ldr x14, [x1, #64] - ldp x11, x12, [x2] - and x24, x11, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - extr x24, x4, x3, #52 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x12, x11, #52 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #12 - adds x15, x15, x24 - ldp x5, x3, [x1, #16] - ldp x13, x11, [x2, #16] - extr x24, x5, x4, #40 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x13, x12, #40 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #24 - adcs x16, x16, x24 - extr x24, x3, x5, #28 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x11, x13, #28 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #36 - adcs x17, x17, x24 - and x25, x16, x17 - ldp x4, x5, [x1, #32] - ldp x12, x13, [x2, #32] - extr x24, x4, x3, #16 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x12, x11, #16 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsl x21, x21, #48 - add x23, x23, x21 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #48 - adcs x19, x19, x24 - and x25, x25, x19 - lsr x24, x4, #4 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - lsr x24, x12, #4 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x21, x22, x23, #60 - extr x24, x5, x4, #56 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x13, x12, #56 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x21, x21, #8 - extr x24, x23, x21, #8 - adcs x7, x7, x24 - and x25, x25, x7 - ldp x3, x4, [x1, #48] - ldp x11, x12, [x2, #48] - extr x24, x3, x5, #44 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x11, x13, #44 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #20 - adcs x8, x8, x24 - and x25, x25, x8 - extr x24, x4, x3, #32 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x12, x11, #32 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #32 - adcs x9, x9, x24 - and x25, x25, x9 - lsr x24, x4, #20 - mul x22, x6, x24 - lsr x24, x12, #20 - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #44 - adcs x10, x10, x24 - and x25, x25, x10 - mul x24, x6, x14 - lsr x22, x22, #44 - add x24, x24, x22 - adc x20, x20, x24 - lsr x22, x20, #9 - orr x20, x20, #0xfffffffffffffe00 - cmp xzr, xzr - adcs xzr, x15, x22 - adcs xzr, x25, xzr - adcs xzr, x20, xzr - adcs x15, x15, x22 - adcs x16, x16, xzr - adcs x17, x17, xzr - adcs x19, x19, xzr - adcs x7, x7, xzr - adcs x8, x8, xzr - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x20, x20, xzr - and x22, x15, #0x1ff - extr x15, x16, x15, #9 - extr x16, x17, x16, #9 - stp x15, x16, [x0] - extr x17, x19, x17, #9 - extr x19, x7, x19, #9 - stp x17, x19, [x0, #16] - extr x7, x8, x7, #9 - extr x8, x9, x8, #9 - stp x7, x8, [x0, #32] - extr x9, x10, x9, #9 - extr 
x10, x20, x10, #9 - stp x9, x10, [x0, #48] - str x22, [x0, #64] + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + sub sp, sp, #80 + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + 
adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, 
#32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs 
x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 ret local_sqr_p521: - ldp x2, x3, [x1] - ldp x4, x5, [x1, #16] - 
ldp x6, x7, [x1, #32] - ldp x8, x9, [x1, #48] - mul x12, x6, x8 - mul x17, x7, x9 - umulh x22, x6, x8 - subs x23, x6, x7 - cneg x23, x23, cc - csetm x11, cc - subs x10, x9, x8 - cneg x10, x10, cc - mul x16, x23, x10 - umulh x10, x23, x10 - cinv x11, x11, cc - eor x16, x16, x11 - eor x10, x10, x11 - adds x13, x12, x22 - adc x22, x22, xzr - umulh x23, x7, x9 - adds x13, x13, x17 - adcs x22, x22, x23 - adc x23, x23, xzr - adds x22, x22, x17 - adc x23, x23, xzr - cmn x11, #0x1 - adcs x13, x13, x16 - adcs x22, x22, x10 - adc x23, x23, x11 - adds x12, x12, x12 - adcs x13, x13, x13 - adcs x22, x22, x22 - adcs x23, x23, x23 - adc x19, xzr, xzr - mul x10, x6, x6 - mul x16, x7, x7 - mul x21, x6, x7 - umulh x11, x6, x6 - umulh x17, x7, x7 - umulh x20, x6, x7 - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x12, x12, x16 - adcs x13, x13, x17 - adcs x22, x22, xzr - adcs x23, x23, xzr - adc x19, x19, xzr - mul x14, x8, x8 - mul x16, x9, x9 - mul x21, x8, x9 - umulh x15, x8, x8 - umulh x17, x9, x9 - umulh x20, x8, x9 - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x14, x14, x22 - adcs x15, x15, x23 - adcs x16, x16, x19 - adc x17, x17, xzr - ldr x19, [x1, #64] - add x23, x19, x19 - mul x19, x19, x19 - and x21, x2, #0xfffffffffffff - mul x21, x23, x21 - extr x20, x3, x2, #52 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #12 - adds x10, x10, x22 - extr x21, x4, x3, #40 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #24 - adcs x11, x11, x22 - extr x20, x5, x4, #28 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #36 - adcs x12, x12, x22 - extr x21, x6, x5, #16 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #48 - adcs x13, x13, x22 - lsr x20, x6, #4 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x24, x20, x21, #60 - extr x21, x7, x6, #56 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x24, x24, #8 - extr x22, x21, x24, #8 - adcs x14, x14, x22 - extr x20, x8, x7, #44 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #20 - adcs x15, x15, x22 - extr x21, x9, x8, #32 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #32 - adcs x16, x16, x22 - lsr x20, x9, #20 - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #44 - adcs x17, x17, x22 - lsr x20, x20, #44 - adc x19, x19, x20 - extr x21, x11, x10, #9 - extr x20, x12, x11, #9 - stp x21, x20, [x0] - extr x21, x13, x12, #9 - extr x20, x14, x13, #9 - stp x21, x20, [x0, #16] - extr x21, x15, x14, #9 - extr x20, x16, x15, #9 - stp x21, x20, [x0, #32] - extr x21, x17, x16, #9 - extr x20, x19, x17, #9 - stp x21, x20, [x0, #48] - and x22, x10, #0x1ff - lsr x19, x19, #9 - add x22, x22, x19 - str x22, [x0, #64] - mul x12, x2, x4 - mul x17, x3, x5 - umulh x22, x2, x4 - subs x23, x2, x3 - cneg x23, x23, cc - csetm x11, cc - subs x10, 
x5, x4 - cneg x10, x10, cc - mul x16, x23, x10 - umulh x10, x23, x10 - cinv x11, x11, cc - eor x16, x16, x11 - eor x10, x10, x11 - adds x13, x12, x22 - adc x22, x22, xzr - umulh x23, x3, x5 - adds x13, x13, x17 - adcs x22, x22, x23 - adc x23, x23, xzr - adds x22, x22, x17 - adc x23, x23, xzr - cmn x11, #0x1 - adcs x13, x13, x16 - adcs x22, x22, x10 - adc x23, x23, x11 - adds x12, x12, x12 - adcs x13, x13, x13 - adcs x22, x22, x22 - adcs x23, x23, x23 - adc x19, xzr, xzr - mul x10, x2, x2 - mul x16, x3, x3 - mul x21, x2, x3 - umulh x11, x2, x2 - umulh x17, x3, x3 - umulh x20, x2, x3 - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x12, x12, x16 - adcs x13, x13, x17 - adcs x22, x22, xzr - adcs x23, x23, xzr - adc x19, x19, xzr - mul x14, x4, x4 - mul x16, x5, x5 - mul x21, x4, x5 - umulh x15, x4, x4 - umulh x17, x5, x5 - umulh x20, x4, x5 - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x14, x14, x22 - adcs x15, x15, x23 - adcs x16, x16, x19 - adc x17, x17, xzr - ldp x21, x20, [x0] - adds x21, x21, x10 - adcs x20, x20, x11 - stp x21, x20, [x0] - ldp x21, x20, [x0, #16] - adcs x21, x21, x12 - adcs x20, x20, x13 - stp x21, x20, [x0, #16] - ldp x21, x20, [x0, #32] - adcs x21, x21, x14 - adcs x20, x20, x15 - stp x21, x20, [x0, #32] - ldp x21, x20, [x0, #48] - adcs x21, x21, x16 - adcs x20, x20, x17 - stp x21, x20, [x0, #48] - ldr x22, [x0, #64] - adc x22, x22, xzr - str x22, [x0, #64] - mul x10, x2, x6 - mul x14, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x8 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc - subs x20, x7, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x7 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc - subs x20, x8, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs 
x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc - subs x20, x8, x7 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - ldp x21, x20, [x0] - extr x2, x15, x14, #8 - adds x2, x2, x21 - extr x3, x16, x15, #8 - adcs x3, x3, x20 - ldp x21, x20, [x0, #16] - extr x4, x17, x16, #8 - adcs x4, x4, x21 - and x22, x3, x4 - lsr x5, x17, #8 - adcs x5, x5, x20 - and x22, x22, x5 - ldp x21, x20, [x0, #32] - lsl x6, x10, #1 - adcs x6, x6, x21 - and x22, x22, x6 - extr x7, x11, x10, #63 - adcs x7, x7, x20 - and x22, x22, x7 - ldp x21, x20, [x0, #48] - extr x8, x12, x11, #63 - adcs x8, x8, x21 - and x22, x22, x8 - extr x9, x13, x12, #63 - adcs x9, x9, x20 - and x22, x22, x9 - ldr x21, [x0, #64] - extr x10, x14, x13, #63 - and x10, x10, #0x1ff - adc x10, x21, x10 - lsr x20, x10, #9 - orr x10, x10, #0xfffffffffffffe00 - cmp xzr, xzr - adcs xzr, x2, x20 - adcs xzr, x22, xzr - adcs xzr, x10, xzr - adcs x2, x2, x20 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - adcs x6, x6, xzr - adcs x7, x7, xzr - adcs x8, x8, xzr - adcs x9, x9, xzr - adc x10, x10, xzr - and x10, x10, #0x1ff - stp x2, x3, [x0] - stp x4, x5, [x0, #16] - stp x6, x7, [x0, #32] - stp x8, x9, [x0, #48] - str x10, [x0, #64] + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + 
xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, 
x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs 
x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, #32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 ret local_sub_p521: diff --git a/third_party/s2n-bignum/arm/p521/p521_jdouble.S b/third_party/s2n-bignum/arm/p521/p521_jdouble.S index 3eb0250b33..73afe4ffbd 100644 --- a/third_party/s2n-bignum/arm/p521/p521_jdouble.S +++ b/third_party/s2n-bignum/arm/p521/p521_jdouble.S @@ -54,7 +54,7 @@ #define dx2 sp, #(NUMSIZE*5) #define t1 sp, #(NUMSIZE*5) -#define d sp, #(NUMSIZE*6) +#define d_ sp, #(NUMSIZE*6) #define x4p sp, #(NUMSIZE*6) // NUMSIZE*7 is not 16-aligned so we round it up @@ -460,7 +460,7 @@ S2N_BN_SYMBOL(p521_jdouble): // d = 12 * xy2 - 9 * x4p // t1 = y^2 + 2 * y * z - cmsub_p521(d,12,xy2,9,x4p) + cmsub_p521(d_,12,xy2,9,x4p) sub_p521(t1,t2,z2) // y4 = y^4 @@ -471,11 +471,11 @@ S2N_BN_SYMBOL(p521_jdouble): // dx2 = d * x2p sub_p521(z_3,t1,y2) - mul_p521(dx2,d,x2p) + mul_p521(dx2,d_,x2p) // x' = 4 * xy2 - d - cmsub41_p521(x_3,xy2,d) + cmsub41_p521(x_3,xy2,d_) // y' = 3 * dx2 - 8 * y4 @@ -493,1054 +493,1223 @@ S2N_BN_SYMBOL(p521_jdouble): ldp x19, x20, [sp], 16 ret -// Local versions of the two "big" field operations, almost identical to -// bignum_mul_p521 and bignum_sqr_p521 except for avoiding the intial -// register save-restore, and in the case of local_mul_p521, using the -// output buffer as temporary 
storage, slightly reordering a few loads -// and stores to make it aliasing-proof. +// Local versions of the two "big" field operations, identical to +// bignum_mul_p521_neon and bignum_sqr_p521_neon. local_mul_p521: - ldp x3, x4, [x1] - ldp x5, x6, [x1, #16] - ldp x7, x8, [x2] - ldp x9, x10, [x2, #16] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 - subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - lsl x21, x11, #9 - extr x11, x12, x11, #55 - extr x12, x13, x12, #55 - extr x13, x14, x13, #55 - lsr x14, x14, #55 - ldp x3, x4, [x1, #32] - ldp x5, x6, [x1, #48] - ldp x7, x8, [x2, #32] - ldp x9, x10, [x2, #48] - stp x15, x16, [x0] - stp x17, x19, [x0, #16] - stp x21, x11, [x0, #32] - stp x12, x13, [x0, #48] - str x14, [x0, #64] - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - 
mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 - subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - ldp x23, x22, [x0] - adds x11, x11, x23 - adcs x12, x12, x22 - stp x11, x12, [x0] - ldp x23, x22, [x0, #16] - adcs x13, x13, x23 - adcs x14, x14, x22 - stp x13, x14, [x0, #16] - ldp x23, x22, [x0, #32] - adcs x15, x15, x23 - adcs x16, x16, x22 - stp x15, x16, [x0, #32] - ldp x23, x22, [x0, #48] - adcs x17, x17, x23 - adcs x19, x19, x22 - stp x17, x19, [x0, #48] - ldr x21, [x0, #64] - adc x21, x21, xzr - str x21, [x0, #64] - ldp x23, x22, [x1] - subs x3, x3, x23 - sbcs x4, x4, x22 - ldp x23, x22, [x1, #16] - sbcs x5, x5, x23 - sbcs x6, x6, x22 - csetm x24, lo - ldp x23, x22, [x2] - subs x7, x23, x7 - sbcs x8, x22, x8 - ldp x23, x22, [x2, #16] - sbcs x9, x23, x9 - sbcs x10, x22, x10 - csetm x25, lo - eor x3, x3, x24 - subs x3, x3, x24 - eor x4, x4, x24 - sbcs x4, x4, x24 - eor x5, x5, x24 - sbcs x5, x5, x24 - eor x6, x6, x24 - sbc x6, x6, x24 - eor x7, x7, x25 - subs x7, x7, x25 - eor x8, x8, x25 - sbcs x8, x8, x25 - eor x9, x9, x25 - sbcs x9, x9, x25 - eor x10, x10, x25 - sbc x10, x10, x25 - eor x25, x25, x24 - mul x11, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - mul x17, x6, x10 - umulh x19, x3, x7 - adds x15, x15, x19 - umulh x19, x4, x8 - adcs x16, x16, x19 - umulh x19, x5, x9 - adcs x17, x17, x19 - umulh x19, x6, x10 - adc x19, x19, xzr - adds x12, x15, x11 - adcs x15, x16, x15 - adcs x16, x17, x16 - adcs x17, x19, x17 - adc x19, xzr, x19 - adds x13, x15, x11 - adcs x14, x16, x12 - adcs x15, x17, x15 - adcs x16, x19, x16 - adcs x17, xzr, x17 - adc x19, xzr, x19 - subs x24, x5, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x9 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x16, x16, x23 - eor x22, x22, x21 - adcs x17, x17, x22 - adc x19, x19, x21 
- subs x24, x3, x4 - cneg x24, x24, lo - csetm x21, lo - subs x22, x8, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x12, x12, x23 - eor x22, x22, x21 - adcs x13, x13, x22 - adcs x14, x14, x21 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x15, x15, x23 - eor x22, x22, x21 - adcs x16, x16, x22 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x13, x13, x23 - eor x22, x22, x21 - adcs x14, x14, x22 - adcs x15, x15, x21 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x3, x6 - cneg x24, x24, lo - csetm x21, lo - subs x22, x10, x7 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - subs x24, x4, x5 - cneg x24, x24, lo - csetm x21, lo - subs x22, x9, x8 - cneg x22, x22, lo - mul x23, x24, x22 - umulh x22, x24, x22 - cinv x21, x21, lo - cmn x21, #1 - eor x23, x23, x21 - adcs x14, x14, x23 - eor x22, x22, x21 - adcs x15, x15, x22 - adcs x16, x16, x21 - adcs x17, x17, x21 - adc x19, x19, x21 - ldp x3, x4, [x0] - ldp x5, x6, [x0, #16] - eor x11, x11, x25 - adds x11, x11, x3 - eor x12, x12, x25 - adcs x12, x12, x4 - eor x13, x13, x25 - adcs x13, x13, x5 - eor x14, x14, x25 - adcs x14, x14, x6 - eor x15, x15, x25 - ldp x7, x8, [x0, #32] - ldp x9, x10, [x0, #48] - ldr x20, [x0, #64] - adcs x15, x15, x7 - eor x16, x16, x25 - adcs x16, x16, x8 - eor x17, x17, x25 - adcs x17, x17, x9 - eor x19, x19, x25 - adcs x19, x19, x10 - adc x21, x20, xzr - adds x15, x15, x3 - adcs x16, x16, x4 - adcs x17, x17, x5 - adcs x19, x19, x6 - and x25, x25, #0x1ff - lsl x24, x11, #9 - orr x24, x24, x25 - adcs x7, x7, x24 - extr x24, x12, x11, #55 - adcs x8, x8, x24 - extr x24, x13, x12, #55 - adcs x9, x9, x24 - extr x24, x14, x13, #55 - adcs x10, x10, x24 - lsr x24, x14, #55 - adc x20, x24, x20 - ldr x6, [x2, #64] - ldp x3, x4, [x1] - and x23, x3, #0xfffffffffffff - mul x23, x6, x23 - ldr x14, [x1, #64] - ldp x11, x12, [x2] - and x24, x11, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - extr x24, x4, x3, #52 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x12, x11, #52 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #12 - adds x15, x15, x24 - ldp x5, x3, [x1, #16] - ldp x13, x11, [x2, #16] - extr x24, x5, x4, #40 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x13, x12, #40 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #24 - adcs x16, x16, x24 - extr x24, x3, x5, #28 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x11, x13, #28 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #36 - adcs x17, x17, x24 - and x26, x16, x17 - ldp x4, x5, [x1, #32] - 
ldp x12, x13, [x2, #32] - extr x24, x4, x3, #16 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x12, x11, #16 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsl x21, x21, #48 - add x23, x23, x21 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #48 - adcs x19, x19, x24 - and x26, x26, x19 - lsr x24, x4, #4 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - lsr x24, x12, #4 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x25, x22, x23, #60 - extr x24, x5, x4, #56 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x13, x12, #56 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x25, x25, #8 - extr x24, x23, x25, #8 - adcs x7, x7, x24 - and x26, x26, x7 - ldp x3, x4, [x1, #48] - ldp x11, x12, [x2, #48] - extr x24, x3, x5, #44 - and x24, x24, #0xfffffffffffff - mul x22, x6, x24 - extr x24, x11, x13, #44 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #20 - adcs x8, x8, x24 - and x26, x26, x8 - extr x24, x4, x3, #32 - and x24, x24, #0xfffffffffffff - mul x23, x6, x24 - extr x24, x12, x11, #32 - and x24, x24, #0xfffffffffffff - mul x24, x14, x24 - add x23, x23, x24 - lsr x24, x22, #52 - add x23, x23, x24 - lsl x22, x22, #12 - extr x24, x23, x22, #32 - adcs x9, x9, x24 - and x26, x26, x9 - lsr x24, x4, #20 - mul x22, x6, x24 - lsr x24, x12, #20 - mul x24, x14, x24 - add x22, x22, x24 - lsr x24, x23, #52 - add x22, x22, x24 - lsl x23, x23, #12 - extr x24, x22, x23, #44 - adcs x10, x10, x24 - and x26, x26, x10 - mul x24, x6, x14 - lsr x22, x22, #44 - add x24, x24, x22 - adc x20, x20, x24 - lsr x22, x20, #9 - orr x20, x20, #0xfffffffffffffe00 - cmp xzr, xzr - adcs xzr, x15, x22 - adcs xzr, x26, xzr - adcs xzr, x20, xzr - adcs x15, x15, x22 - adcs x16, x16, xzr - adcs x17, x17, xzr - adcs x19, x19, xzr - adcs x7, x7, xzr - adcs x8, x8, xzr - adcs x9, x9, xzr - adcs x10, x10, xzr - adc x20, x20, xzr - and x22, x15, #0x1ff - extr x15, x16, x15, #9 - extr x16, x17, x16, #9 - stp x15, x16, [x0] - extr x17, x19, x17, #9 - extr x19, x7, x19, #9 - stp x17, x19, [x0, #16] - extr x7, x8, x7, #9 - extr x8, x9, x8, #9 - stp x7, x8, [x0, #32] - extr x9, x10, x9, #9 - extr x10, x20, x10, #9 - stp x9, x10, [x0, #48] - str x22, [x0, #64] + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, #80 + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + 
mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, #32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + 
stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr 
x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 ret local_sqr_p521: - ldp x2, x3, [x1] - ldp x4, x5, [x1, #16] - ldp x6, x7, [x1, #32] - ldp x8, x9, [x1, #48] - mul x12, x6, x8 - mul x17, x7, x9 - umulh x22, x6, x8 - subs x23, x6, x7 - cneg x23, x23, cc - csetm x11, cc - subs x10, x9, x8 
- cneg x10, x10, cc - mul x16, x23, x10 - umulh x10, x23, x10 - cinv x11, x11, cc - eor x16, x16, x11 - eor x10, x10, x11 - adds x13, x12, x22 - adc x22, x22, xzr - umulh x23, x7, x9 - adds x13, x13, x17 - adcs x22, x22, x23 - adc x23, x23, xzr - adds x22, x22, x17 - adc x23, x23, xzr - cmn x11, #0x1 - adcs x13, x13, x16 - adcs x22, x22, x10 - adc x23, x23, x11 - adds x12, x12, x12 - adcs x13, x13, x13 - adcs x22, x22, x22 - adcs x23, x23, x23 - adc x19, xzr, xzr - mul x10, x6, x6 - mul x16, x7, x7 - mul x21, x6, x7 - umulh x11, x6, x6 - umulh x17, x7, x7 - umulh x20, x6, x7 - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x12, x12, x16 - adcs x13, x13, x17 - adcs x22, x22, xzr - adcs x23, x23, xzr - adc x19, x19, xzr - mul x14, x8, x8 - mul x16, x9, x9 - mul x21, x8, x9 - umulh x15, x8, x8 - umulh x17, x9, x9 - umulh x20, x8, x9 - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x14, x14, x22 - adcs x15, x15, x23 - adcs x16, x16, x19 - adc x17, x17, xzr - ldr x19, [x1, #64] - add x23, x19, x19 - mul x19, x19, x19 - and x21, x2, #0xfffffffffffff - mul x21, x23, x21 - extr x20, x3, x2, #52 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #12 - adds x10, x10, x22 - extr x21, x4, x3, #40 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #24 - adcs x11, x11, x22 - extr x20, x5, x4, #28 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #36 - adcs x12, x12, x22 - extr x21, x6, x5, #16 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #48 - adcs x13, x13, x22 - lsr x20, x6, #4 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x24, x20, x21, #60 - extr x21, x7, x6, #56 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x24, x24, #8 - extr x22, x21, x24, #8 - adcs x14, x14, x22 - extr x20, x8, x7, #44 - and x20, x20, #0xfffffffffffff - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #20 - adcs x15, x15, x22 - extr x21, x9, x8, #32 - and x21, x21, #0xfffffffffffff - mul x21, x23, x21 - lsr x22, x20, #52 - add x21, x21, x22 - lsl x20, x20, #12 - extr x22, x21, x20, #32 - adcs x16, x16, x22 - lsr x20, x9, #20 - mul x20, x23, x20 - lsr x22, x21, #52 - add x20, x20, x22 - lsl x21, x21, #12 - extr x22, x20, x21, #44 - adcs x17, x17, x22 - lsr x20, x20, #44 - adc x19, x19, x20 - extr x21, x11, x10, #9 - extr x20, x12, x11, #9 - stp x21, x20, [x0] - extr x21, x13, x12, #9 - extr x20, x14, x13, #9 - stp x21, x20, [x0, #16] - extr x21, x15, x14, #9 - extr x20, x16, x15, #9 - stp x21, x20, [x0, #32] - extr x21, x17, x16, #9 - extr x20, x19, x17, #9 - stp x21, x20, [x0, #48] - and x22, x10, #0x1ff - lsr x19, x19, #9 - add x22, x22, x19 - str x22, [x0, #64] - mul x12, x2, x4 - mul x17, x3, x5 - umulh x22, x2, x4 - subs x23, x2, x3 - cneg x23, x23, cc - csetm x11, cc - subs x10, x5, x4 - cneg x10, x10, cc - mul x16, x23, x10 - umulh x10, x23, x10 - cinv x11, x11, cc - eor x16, x16, x11 - eor x10, x10, x11 - adds x13, x12, x22 - adc x22, x22, xzr - 
umulh x23, x3, x5 - adds x13, x13, x17 - adcs x22, x22, x23 - adc x23, x23, xzr - adds x22, x22, x17 - adc x23, x23, xzr - cmn x11, #0x1 - adcs x13, x13, x16 - adcs x22, x22, x10 - adc x23, x23, x11 - adds x12, x12, x12 - adcs x13, x13, x13 - adcs x22, x22, x22 - adcs x23, x23, x23 - adc x19, xzr, xzr - mul x10, x2, x2 - mul x16, x3, x3 - mul x21, x2, x3 - umulh x11, x2, x2 - umulh x17, x3, x3 - umulh x20, x2, x3 - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x11, x11, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x12, x12, x16 - adcs x13, x13, x17 - adcs x22, x22, xzr - adcs x23, x23, xzr - adc x19, x19, xzr - mul x14, x4, x4 - mul x16, x5, x5 - mul x21, x4, x5 - umulh x15, x4, x4 - umulh x17, x5, x5 - umulh x20, x4, x5 - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x15, x15, x21 - adcs x16, x16, x20 - adc x17, x17, xzr - adds x14, x14, x22 - adcs x15, x15, x23 - adcs x16, x16, x19 - adc x17, x17, xzr - ldp x21, x20, [x0] - adds x21, x21, x10 - adcs x20, x20, x11 - stp x21, x20, [x0] - ldp x21, x20, [x0, #16] - adcs x21, x21, x12 - adcs x20, x20, x13 - stp x21, x20, [x0, #16] - ldp x21, x20, [x0, #32] - adcs x21, x21, x14 - adcs x20, x20, x15 - stp x21, x20, [x0, #32] - ldp x21, x20, [x0, #48] - adcs x21, x21, x16 - adcs x20, x20, x17 - stp x21, x20, [x0, #48] - ldr x22, [x0, #64] - adc x22, x22, xzr - str x22, [x0, #64] - mul x10, x2, x6 - mul x14, x3, x7 - mul x15, x4, x8 - mul x16, x5, x9 - umulh x17, x2, x6 - adds x14, x14, x17 - umulh x17, x3, x7 - adcs x15, x15, x17 - umulh x17, x4, x8 - adcs x16, x16, x17 - umulh x17, x5, x9 - adc x17, x17, xzr - adds x11, x14, x10 - adcs x14, x15, x14 - adcs x15, x16, x15 - adcs x16, x17, x16 - adc x17, xzr, x17 - adds x12, x14, x10 - adcs x13, x15, x11 - adcs x14, x16, x14 - adcs x15, x17, x15 - adcs x16, xzr, x16 - adc x17, xzr, x17 - subs x22, x4, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x8 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x15, x15, x21 - eor x20, x20, x19 - adcs x16, x16, x20 - adc x17, x17, x19 - subs x22, x2, x3 - cneg x22, x22, cc - csetm x19, cc - subs x20, x7, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x11, x11, x21 - eor x20, x20, x19 - adcs x12, x12, x20 - adcs x13, x13, x19 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x7 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x14, x14, x21 - eor x20, x20, x19 - adcs x15, x15, x20 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x4 - cneg x22, x22, cc - csetm x19, cc - subs x20, x8, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x12, x12, x21 - eor x20, x20, x19 - adcs x13, x13, x20 - adcs x14, x14, x19 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x2, x5 - cneg x22, x22, cc - csetm x19, cc - subs x20, x9, x6 - cneg x20, x20, cc - mul x21, x22, x20 - umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - subs x22, x3, x4 - cneg x22, x22, cc - csetm x19, cc - subs x20, x8, x7 - cneg x20, x20, cc - mul x21, x22, x20 - 
umulh x20, x22, x20 - cinv x19, x19, cc - cmn x19, #0x1 - eor x21, x21, x19 - adcs x13, x13, x21 - eor x20, x20, x19 - adcs x14, x14, x20 - adcs x15, x15, x19 - adcs x16, x16, x19 - adc x17, x17, x19 - ldp x21, x20, [x0] - extr x2, x15, x14, #8 - adds x2, x2, x21 - extr x3, x16, x15, #8 - adcs x3, x3, x20 - ldp x21, x20, [x0, #16] - extr x4, x17, x16, #8 - adcs x4, x4, x21 - and x22, x3, x4 - lsr x5, x17, #8 - adcs x5, x5, x20 - and x22, x22, x5 - ldp x21, x20, [x0, #32] - lsl x6, x10, #1 - adcs x6, x6, x21 - and x22, x22, x6 - extr x7, x11, x10, #63 - adcs x7, x7, x20 - and x22, x22, x7 - ldp x21, x20, [x0, #48] - extr x8, x12, x11, #63 - adcs x8, x8, x21 - and x22, x22, x8 - extr x9, x13, x12, #63 - adcs x9, x9, x20 - and x22, x22, x9 - ldr x21, [x0, #64] - extr x10, x14, x13, #63 - and x10, x10, #0x1ff - adc x10, x21, x10 - lsr x20, x10, #9 - orr x10, x10, #0xfffffffffffffe00 - cmp xzr, xzr - adcs xzr, x2, x20 - adcs xzr, x22, xzr - adcs xzr, x10, xzr - adcs x2, x2, x20 - adcs x3, x3, xzr - adcs x4, x4, xzr - adcs x5, x5, xzr - adcs x6, x6, xzr - adcs x7, x7, xzr - adcs x8, x8, xzr - adcs x9, x9, xzr - adc x10, x10, xzr - and x10, x10, #0x1ff - stp x2, x3, [x0] - stp x4, x5, [x0, #16] - stp x6, x7, [x0, #32] - stp x8, x9, [x0, #48] - str x10, [x0, #64] + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, 
v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + 
adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + 
csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, #32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 ret + #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack, "", %progbits #endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S b/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S new file mode 100644 index 0000000000..37cc923130 --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/p521_jscalarmul.S @@ -0,0 +1,2706 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +// +// extern void p521_jscalarmul +// (uint64_t res[static 27], +// uint64_t scalar[static 9], +// uint64_t point[static 27]); +// +// This function is a variant of its affine point version p521_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// a triple (x,y,z) representing the affine point (x/z^2,y/z^3) when +// z is nonzero or the point at infinity (group identity) if z = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-521, returns a representation of n * P. 
If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_521) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 +#define JACSIZE (3*NUMSIZE) + +// Safe copies of input res and additional values in variables. + +#define tabup x15 +#define bf x16 +#define sgn x17 +#define j x19 +#define res x20 + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +// Round up to maintain stack alignment + +#define NSPACE #(55*NUMSIZE+8) + +#define selectblock(I) \ + cmp bf, #(1*I); \ + ldp x10, x11, [tabup]; \ + csel x0, x10, x0, eq; \ + csel x1, x11, x1, eq; \ + ldp x10, x11, [tabup, #16]; \ + csel x2, x10, x2, eq; \ + csel x3, x11, x3, eq; \ + ldp x10, x11, [tabup, #32]; \ + csel x4, x10, x4, eq; \ + csel x5, x11, x5, eq; \ + ldp x10, x11, [tabup, #48]; \ + csel x6, x10, x6, eq; \ + csel x7, x11, x7, eq; \ + ldr x10, [tabup, #64]; \ + csel x8, x10, x8, eq; \ + add tabup, tabup, #JACSIZE + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p521_jscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" input argument; others get processed early. + + mov res, x0 + +// Reduce the input scalar mod n_521 and store it to "scalarb". + + mov x19, x2 + add x0, scalarb + bl p521_jscalarmul_bignum_mod_n521_9 + mov x2, x19 + +// Set the tab[0] table entry to the input point = 1 * P, but also +// reduce all coordinates modulo p. In principle we assume reduction +// as a precondition, but this reduces the scope for surprise, e.g. +// making sure that any input with z = 0 is treated as zero, even +// if the other coordinates are not in fact reduced. + + add x0, tab + mov x1, x19 + bl p521_jscalarmul_bignum_mod_p521_9 + + add x0, tab+NUMSIZE + add x1, x19, #NUMSIZE + bl p521_jscalarmul_bignum_mod_p521_9 + + add x0, tab+2*NUMSIZE + add x1, x19, #(2*NUMSIZE) + bl p521_jscalarmul_bignum_mod_p521_9 + +// If bit 520 of the scalar is set, then negate the scalar mod n_521, +// i.e. do scalar |-> n_521 - scalar, and also the point to compensate +// by negating its y coordinate. This further step is not needed by +// the indexing scheme (the top window is only a couple of bits either +// way), but is convenient to exclude a problem with the specific value +// scalar = n_521 - 18, where the last Jacobian addition is of the form +// (n_521 - 9) * P + -(9 * P) and hence is a degenerate doubling case. 
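As an aside, the conditional negation that the next block implements can be modelled by a short C sketch (illustrative only; the helpers `sub_9` and `is_nonzero_9` and the constant arrays are stand-ins, not identifiers from this file). Negating the scalar modulo the group order n_521 and simultaneously negating the point (y := p_521 - y) leaves the product unchanged, since (n_521 - n) * (-P) = n * P.

    #include <stdint.h>

    void sub_9(uint64_t r[9], const uint64_t a[9], const uint64_t b[9]); /* r := a - b, assumed helper */
    int is_nonzero_9(const uint64_t a[9]);                               /* assumed helper */

    /* Sketch of the "negate scalar and point together" step; the real code
     * below does the same thing branch-free with csel/eor masks. */
    void negate_scalar_and_point(uint64_t scalar[9], uint64_t y[9],
                                 const uint64_t n_521[9], const uint64_t p_521[9])
    {
        if ((scalar[8] >> 8) & 1) {          /* bit 520 of the reduced scalar */
            sub_9(scalar, n_521, scalar);    /* scalar := n_521 - scalar */
            if (is_nonzero_9(y))             /* keep y = 0 as 0, so it stays < p_521 */
                sub_9(y, p_521, y);          /* y := p_521 - y, i.e. P := -P */
        }
    }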
+ + ldp x0, x1, [scalarb] + movbig(x10, #0xbb6f, #0xb71e, #0x9138, #0x6409) + subs x10, x10, x0 + movbig(x11, #0x3bb5, #0xc9b8, #0x899c, #0x47ae) + sbcs x11, x11, x1 + ldp x2, x3, [scalarb+16] + movbig(x12, #0x7fcc, #0x0148, #0xf709, #0xa5d0) + sbcs x12, x12, x2 + movbig(x13, #0x5186, #0x8783, #0xbf2f, #0x966b) + sbcs x13, x13, x3 + ldp x4, x5, [scalarb+32] + mov x14, 0xfffffffffffffffa + sbcs x14, x14, x4 + mov x15, 0xffffffffffffffff + sbcs x15, x15, x5 + ldp x6, x7, [scalarb+48] + mov x16, 0xffffffffffffffff + sbcs x16, x16, x6 + mov x17, 0xffffffffffffffff + sbcs x17, x17, x7 + ldr x8, [scalarb+64] + mov x19, 0x00000000000001ff + sbc x19, x19, x8 + tst x8, 0x100 + csetm x9, ne + csel x0, x10, x0, ne + csel x1, x11, x1, ne + csel x2, x12, x2, ne + csel x3, x13, x3, ne + csel x4, x14, x4, ne + csel x5, x15, x5, ne + csel x6, x16, x6, ne + csel x7, x17, x7, ne + csel x8, x19, x8, ne + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + + add tabup, tab + ldp x0, x1, [tabup, #NUMSIZE] + ldp x2, x3, [tabup, #NUMSIZE+16] + ldp x4, x5, [tabup, #NUMSIZE+32] + ldp x6, x7, [tabup, #NUMSIZE+48] + ldr x8, [tabup, #NUMSIZE+64] + orr x10, x0, x1 + orr x11, x2, x3 + orr x12, x4, x5 + orr x13, x6, x7 + orr x10, x10, x11 + orr x12, x12, x13 + orr x12, x12, x8 + orr x10, x10, x12 + cmp x10, xzr + csel x9, x9, xzr, ne + eor x0, x0, x9 + eor x1, x1, x9 + eor x2, x2, x9 + eor x3, x3, x9 + eor x4, x4, x9 + eor x5, x5, x9 + eor x6, x6, x9 + eor x7, x7, x9 + and x9, x9, #0x1FF + eor x8, x8, x9 + stp x0, x1, [tabup, #NUMSIZE] + stp x2, x3, [tabup, #NUMSIZE+16] + stp x4, x5, [tabup, #NUMSIZE+32] + stp x6, x7, [tabup, #NUMSIZE+48] + str x8, [tabup, #NUMSIZE+64] + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + add x0, tab+JACSIZE*1 + add x1, tab + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*2 + add x1, tab+JACSIZE*1 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*3 + add x1, tab+JACSIZE*1 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*4 + add x1, tab+JACSIZE*3 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*5 + add x1, tab+JACSIZE*2 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*6 + add x1, tab+JACSIZE*5 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*7 + add x1, tab+JACSIZE*3 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*8 + add x1, tab+JACSIZE*7 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*9 + add x1, tab+JACSIZE*4 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*10 + add x1, tab+JACSIZE*9 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*11 + add x1, tab+JACSIZE*5 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*12 + add x1, tab+JACSIZE*11 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*13 + add x1, tab+JACSIZE*6 + bl p521_jscalarmul_jdouble + + add x0, tab+JACSIZE*14 + add x1, tab+JACSIZE*13 + add x2, tab + bl p521_jscalarmul_jadd + + add x0, tab+JACSIZE*15 + add x1, tab+JACSIZE*7 + bl p521_jscalarmul_jdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically since none is a simple ARM load. 
+// +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x4210842108421084 +// 0x8421084210842108 +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x0000000000000084 + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + ldp x6, x7, [scalarb+48] + ldr x8, [scalarb+64] + + movbig(x10, #0x1084, #0x2108, #0x4210, #0x8421) + adds x0, x0, x10, lsr #1 + adcs x1, x1, x10 + lsl x10, x10, #1 + adcs x2, x2, x10 + lsl x10, x10, #1 + adcs x3, x3, x10 + lsl x10, x10, #1 + adcs x4, x4, x10 + lsr x11, x10, #4 + adcs x5, x5, x11 + lsr x10, x10, #3 + adcs x6, x6, x10 + lsl x10, x10, #1 + adcs x7, x7, x10 + lsl x10, x10, #1 + and x10, x10, #0xFF + adc x8, x8, x10 + +// Because of the initial reduction the top bitfield (>= bits 520) is <= 1, +// i.e. just a single bit. Record that in "bf", then shift the whole +// scalar left 56 bits to align the top of the next bitfield with the MSB +// (bits 571..575). + + lsr bf, x8, #8 + extr x8, x8, x7, #8 + extr x7, x7, x6, #8 + extr x6, x6, x5, #8 + extr x5, x5, x4, #8 + extr x4, x4, x3, #8 + extr x3, x3, x2, #8 + extr x2, x2, x1, #8 + extr x1, x1, x0, #8 + lsl x0, x0, #56 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + +// According to the top bit, initialize the accumulator to P or 0. This top +// digit, uniquely, is not recoded so there is no sign adjustment to make. +// We only really need to adjust the z coordinate to zero, but do all three. + + add tabup, tab + cmp bf, xzr + + ldp x0, x1, [tabup] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc] + ldp x0, x1, [tabup, #16] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+16] + ldp x0, x1, [tabup, #32] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+32] + ldp x0, x1, [tabup, #48] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+48] + ldp x0, x1, [tabup, #64] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+64] + ldp x0, x1, [tabup, #80] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+80] + ldp x0, x1, [tabup, #96] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+96] + ldp x0, x1, [tabup, #112] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+112] + ldp x0, x1, [tabup, #128] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+128] + ldp x0, x1, [tabup, #144] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+144] + ldp x0, x1, [tabup, #160] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+160] + ldp x0, x1, [tabup, #176] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+176] + ldp x0, x1, [tabup, #192] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+192] + ldr x0, [tabup, #208] + csel x0, x0, xzr, ne + str x0, [acc+208] + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. 
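To make the recoding concrete (a hedged sketch, not the code in this file): once the biasing constant has been added, every 5-bit window w of the adjusted scalar, read from the most significant end, stands for the signed digit w - 16 in the range -16..16, and the accumulator absorbs one digit per five doublings. The C outline below models only the loop structure; the point type and the jdouble/jadd/lookup helpers are placeholders, not this file's routines.

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct { uint64_t x[9], y[9], z[9]; } jpoint;   /* stand-in Jacobian point */

    jpoint jdouble(jpoint p);                                /* placeholders for the    */
    jpoint jadd(jpoint p, jpoint q);                         /* routines in this file   */
    jpoint negate_y(jpoint p);
    unsigned window5(const uint64_t s[9], int lowbit);       /* 5 bits starting at lowbit */

    /* Conceptual main loop: 104 windows of 5 bits covering bits 0..519.
     * Bits >= 520 were used earlier to initialise acc and are not recoded. */
    jpoint scalarmul_model(const uint64_t biased[9], const jpoint table[16], jpoint acc)
    {
        for (int i = 103; i >= 0; i--) {
            for (int k = 0; k < 5; k++) acc = jdouble(acc);
            int d = (int)window5(biased, 5 * i) - 16;        /* signed digit, -16..16 */
            if (d != 0) {                                     /* real code: constant-time select */
                jpoint q = table[abs(d) - 1];                 /* table[j-1] holds j * P */
                acc = jadd(acc, d < 0 ? negate_y(q) : q);
            }
        }
        return acc;
    }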
+ + mov j, #520 + +p521_jscalarmul_mainloop: + sub j, j, #5 + + add x0, acc + add x1, acc + bl p521_jscalarmul_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_jdouble + +// Choose the bitfield and adjust it to sign and magnitude + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + ldp x6, x7, [scalarb+48] + ldr x8, [scalarb+64] + lsr bf, x8, #59 + extr x8, x8, x7, #59 + extr x7, x7, x6, #59 + extr x6, x6, x5, #59 + extr x5, x5, x4, #59 + extr x4, x4, x3, #59 + extr x3, x3, x2, #59 + extr x2, x2, x1, #59 + extr x1, x1, x0, #59 + lsl x0, x0, #5 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + + subs bf, bf, #16 + csetm sgn, lo // sgn = sign of digit (1 = negative) + cneg bf, bf, lo // bf = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + str x8, [tabent+64] + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab+2*NUMSIZE + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [tabent+2*NUMSIZE] + stp x2, x3, [tabent+2*NUMSIZE+16] + stp x4, x5, [tabent+2*NUMSIZE+32] + stp x6, x7, [tabent+2*NUMSIZE+48] + str x8, [tabent+2*NUMSIZE+64] + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab+NUMSIZE + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_521 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
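The selectblock scan above reads every table entry and keeps the wanted one with conditional selects, so the memory access pattern is independent of the secret digit; the code that follows then applies the optional y-negation with a mask that is zeroed when y = 0, keeping the stored coordinate below p_521 even in that degenerate case. A minimal C model of the masked-copy selection idea (an illustrative sketch, not the macro itself):

    #include <stdint.h>

    /* Constant-time selection of table[idx-1] for 1 <= idx <= 16 by masked copy. */
    static void select_entry(uint64_t out[9], const uint64_t table[16][9], uint64_t idx)
    {
        for (int i = 0; i < 9; i++) out[i] = 0;
        for (uint64_t j = 1; j <= 16; j++) {
            uint64_t diff = j ^ idx;
            uint64_t mask = ((diff | (0 - diff)) >> 63) - 1;  /* all-ones iff j == idx */
            for (int i = 0; i < 9; i++)
                out[i] |= table[j - 1][i] & mask;
        }
    }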
+ + orr x10, x0, x1 + orr x11, x2, x3 + orr x12, x4, x5 + orr x13, x6, x7 + orr x10, x10, x11 + orr x12, x12, x13 + orr x12, x12, x8 + orr x10, x10, x12 + cmp x10, xzr + csel sgn, sgn, xzr, ne + + eor x0, x0, sgn + eor x1, x1, sgn + eor x2, x2, sgn + eor x3, x3, sgn + eor x4, x4, sgn + eor x5, x5, sgn + eor x6, x6, sgn + eor x7, x7, sgn + and sgn, sgn, #0x1FF + eor x8, x8, sgn + + stp x0, x1, [tabent+NUMSIZE] + stp x2, x3, [tabent+NUMSIZE+16] + stp x4, x5, [tabent+NUMSIZE+32] + stp x6, x7, [tabent+NUMSIZE+48] + str x8, [tabent+NUMSIZE+64] + +// Add to the accumulator + + add x0, acc + add x1, acc + add x2, tabent + bl p521_jscalarmul_jadd + + cbnz j, p521_jscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + ldp x0, x1, [acc+96] + stp x0, x1, [res, #96] + ldp x0, x1, [acc+112] + stp x0, x1, [res, #112] + ldp x0, x1, [acc+128] + stp x0, x1, [res, #128] + ldp x0, x1, [acc+144] + stp x0, x1, [res, #144] + ldp x0, x1, [acc+160] + stp x0, x1, [res, #160] + ldp x0, x1, [acc+176] + stp x0, x1, [res, #176] + ldp x0, x1, [acc+192] + stp x0, x1, [res, #192] + ldr x0, [acc+208] + str x0, [res, #208] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment except +// that we share multiplication and squaring between the point operations. + +p521_jscalarmul_bignum_mod_p521_9: + ldr x12, [x1, #64] + lsr x2, x12, #9 + cmp xzr, xzr + ldp x4, x5, [x1] + adcs xzr, x4, x2 + adcs xzr, x5, xzr + ldp x6, x7, [x1, #16] + and x3, x6, x7 + adcs xzr, x3, xzr + ldp x8, x9, [x1, #32] + and x3, x8, x9 + adcs xzr, x3, xzr + ldp x10, x11, [x1, #48] + and x3, x10, x11 + adcs xzr, x3, xzr + orr x3, x12, #0xfffffffffffffe00 + adcs x3, x3, xzr + adcs x4, x4, x2 + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adcs x11, x11, xzr + adc x12, x12, xzr + and x12, x12, #0x1ff + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + str x12, [x0, #64] + ret + +p521_jscalarmul_bignum_mod_n521_9: + ldr x14, [x1, #64] + lsr x15, x14, #9 + add x15, x15, #1 + mov x2, #39927 + movk x2, #28359, lsl #16 + movk x2, #18657, lsl #32 + movk x2, #17552, lsl #48 + mul x6, x2, x15 + mov x3, #47185 + movk x3, #30307, lsl #16 + movk x3, #13895, lsl #32 + movk x3, #50250, lsl #48 + mul x7, x3, x15 + mov x4, #23087 + movk x4, #2294, lsl #16 + movk x4, #65207, lsl #32 + movk x4, #32819, lsl #48 + mul x8, x4, x15 + mov x5, #27028 + movk x5, #16592, lsl #16 + movk x5, #30844, lsl #32 + movk x5, #44665, lsl #48 + mul x9, x5, x15 + lsl x10, x15, #2 + add x10, x10, x15 + umulh x13, x2, x15 + adds x7, x7, x13 + umulh x13, x3, x15 + adcs x8, x8, x13 + umulh x13, x4, x15 + adcs x9, x9, x13 + umulh x13, x5, x15 + adc x10, x10, x13 + ldp x12, x13, [x1] + adds x6, x6, x12 + adcs x7, x7, x13 + ldp x12, x13, [x1, #16] + adcs x8, x8, x12 + adcs x9, x9, x13 + ldp x13, x11, [x1, #32] + adcs x10, x10, x13 + adcs x11, x11, xzr + ldp x12, x13, [x1, #48] + adcs x12, x12, xzr + adcs x13, x13, xzr + orr x14, x14, #0xfffffffffffffe00 + adcs x14, x14, xzr + csetm x15, lo + and x2, x2, x15 + 
subs x6, x6, x2 + and x3, x3, x15 + sbcs x7, x7, x3 + and x4, x4, x15 + sbcs x8, x8, x4 + and x5, x5, x15 + sbcs x9, x9, x5 + mov x2, #5 + and x2, x2, x15 + sbcs x10, x10, x2 + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + sbc x14, x14, xzr + and x14, x14, #0x1ff + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + stp x10, x11, [x0, #32] + stp x12, x13, [x0, #48] + str x14, [x0, #64] + ret + +p521_jscalarmul_jadd: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! + sub sp, sp, #0x240 + mov x26, x0 + mov x27, x1 + mov x28, x2 + mov x0, sp + add x1, x27, #0x90 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0x168 + add x1, x28, #0x90 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0x1f8 + add x1, x28, #0x90 + add x2, x27, #0x48 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x48 + add x1, x27, #0x90 + add x2, x28, #0x48 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x90 + mov x1, sp + add x2, x28, #0x0 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x168 + add x2, x27, #0x0 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x48 + mov x1, sp + add x2, sp, #0x48 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x1f8 + add x1, sp, #0x168 + add x2, sp, #0x1f8 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0x90 + add x2, sp, #0x120 + bl p521_jscalarmul_sub_p521 + add x0, sp, #0x48 + add x1, sp, #0x48 + add x2, sp, #0x1f8 + bl p521_jscalarmul_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0x168 + bl p521_jscalarmul_sqr_p521 + mov x0, sp + add x1, sp, #0x48 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0x120 + add x1, sp, #0xd8 + add x2, sp, #0x120 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x90 + add x1, sp, #0xd8 + add x2, sp, #0x90 + bl p521_jscalarmul_mul_p521 + mov x0, sp + mov x1, sp + add x2, sp, #0x120 + bl p521_jscalarmul_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0x90 + add x2, sp, #0x120 + bl p521_jscalarmul_sub_p521 + add x0, sp, #0x168 + add x1, sp, #0x168 + add x2, x27, #0x90 + bl p521_jscalarmul_mul_p521 + mov x0, sp + mov x1, sp + add x2, sp, #0x90 + bl p521_jscalarmul_sub_p521 + add x0, sp, #0x120 + add x1, sp, #0x120 + mov x2, sp + bl p521_jscalarmul_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0xd8 + add x2, sp, #0x1f8 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0x168 + add x2, x28, #0x90 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x48 + add x2, sp, #0x120 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x120 + add x2, sp, #0xd8 + bl p521_jscalarmul_sub_p521 + ldp x0, x1, [x27, #144] + ldp x2, x3, [x27, #160] + ldp x4, x5, [x27, #176] + ldp x6, x7, [x27, #192] + ldr x8, [x27, #208] + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + ldp x10, x11, [x28, #144] + ldp x12, x13, [x28, #160] + ldp x14, x15, [x28, #176] + ldp x16, x17, [x28, #192] + ldr x19, [x28, #208] + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + cmp x21, xzr + cset x21, ne + cmp x21, x20 + ldp x10, x11, [sp, #360] + ldp x12, x13, [sp, #376] + ldp x14, x15, [sp, #392] 
+ ldp x16, x17, [sp, #408] + ldr x19, [sp, #424] + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + stp x0, x1, [sp, #360] + stp x2, x3, [sp, #376] + stp x4, x5, [sp, #392] + stp x6, x7, [sp, #408] + str x8, [sp, #424] + ldp x20, x21, [x27] + ldp x0, x1, [sp] + csel x0, x20, x0, cc + csel x1, x21, x1, cc + ldp x20, x21, [x28] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + ldp x20, x21, [x27, #16] + ldp x2, x3, [sp, #16] + csel x2, x20, x2, cc + csel x3, x21, x3, cc + ldp x20, x21, [x28, #16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + ldp x20, x21, [x27, #32] + ldp x4, x5, [sp, #32] + csel x4, x20, x4, cc + csel x5, x21, x5, cc + ldp x20, x21, [x28, #32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + ldp x20, x21, [x27, #48] + ldp x6, x7, [sp, #48] + csel x6, x20, x6, cc + csel x7, x21, x7, cc + ldp x20, x21, [x28, #48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + ldr x20, [x27, #64] + ldr x8, [sp, #64] + csel x8, x20, x8, cc + ldr x21, [x28, #64] + csel x8, x21, x8, hi + ldp x20, x21, [x27, #72] + ldp x10, x11, [sp, #288] + csel x10, x20, x10, cc + csel x11, x21, x11, cc + ldp x20, x21, [x28, #72] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + ldp x20, x21, [x27, #88] + ldp x12, x13, [sp, #304] + csel x12, x20, x12, cc + csel x13, x21, x13, cc + ldp x20, x21, [x28, #88] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + ldp x20, x21, [x27, #104] + ldp x14, x15, [sp, #320] + csel x14, x20, x14, cc + csel x15, x21, x15, cc + ldp x20, x21, [x28, #104] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + ldp x20, x21, [x27, #120] + ldp x16, x17, [sp, #336] + csel x16, x20, x16, cc + csel x17, x21, x17, cc + ldp x20, x21, [x28, #120] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + ldr x20, [x27, #136] + ldr x19, [sp, #352] + csel x19, x20, x19, cc + ldr x21, [x28, #136] + csel x19, x21, x19, hi + stp x0, x1, [x26] + stp x2, x3, [x26, #16] + stp x4, x5, [x26, #32] + stp x6, x7, [x26, #48] + str x8, [x26, #64] + ldp x0, x1, [sp, #360] + ldp x2, x3, [sp, #376] + ldp x4, x5, [sp, #392] + ldp x6, x7, [sp, #408] + ldr x8, [sp, #424] + stp x10, x11, [x26, #72] + stp x12, x13, [x26, #88] + stp x14, x15, [x26, #104] + stp x16, x17, [x26, #120] + str x19, [x26, #136] + stp x0, x1, [x26, #144] + stp x2, x3, [x26, #160] + stp x4, x5, [x26, #176] + stp x6, x7, [x26, #192] + str x8, [x26, #208] + add sp, sp, #0x240 + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_jdouble: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! 
+ sub sp, sp, #0x200 + mov x26, x0 + mov x27, x1 + mov x0, sp + add x1, x27, #0x90 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0x48 + add x1, x27, #0x48 + bl p521_jscalarmul_sqr_p521 + ldp x5, x6, [x27] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x27, #16] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x27, #32] + ldp x4, x3, [sp, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x27, #48] + ldp x4, x3, [sp, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x27, #64] + ldr x4, [sp, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [sp, #216] + stp x7, x8, [sp, #232] + stp x9, x10, [sp, #248] + stp x11, x12, [sp, #264] + str x13, [sp, #280] + cmp xzr, xzr + ldp x5, x6, [x27] + ldp x4, x3, [sp] + adcs x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x27, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x27, #32] + ldp x4, x3, [sp, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + ldp x11, x12, [x27, #48] + ldp x4, x3, [sp, #48] + adcs x11, x11, x4 + adcs x12, x12, x3 + ldr x13, [x27, #64] + ldr x4, [sp, #64] + adc x13, x13, x4 + subs x4, x13, #0x200 + csetm x4, cs + sbcs x5, x5, xzr + and x4, x4, #0x200 + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbc x13, x13, x4 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + add x0, sp, #0xd8 + add x1, sp, #0x90 + add x2, sp, #0xd8 + bl p521_jscalarmul_mul_p521 + cmp xzr, xzr + ldp x5, x6, [x27, #72] + ldp x4, x3, [x27, #144] + adcs x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x27, #88] + ldp x4, x3, [x27, #160] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x27, #104] + ldp x4, x3, [x27, #176] + adcs x9, x9, x4 + adcs x10, x10, x3 + ldp x11, x12, [x27, #120] + ldp x4, x3, [x27, #192] + adcs x11, x11, x4 + adcs x12, x12, x3 + ldr x13, [x27, #136] + ldr x4, [x27, #208] + adc x13, x13, x4 + subs x4, x13, #0x200 + csetm x4, cs + sbcs x5, x5, xzr + and x4, x4, #0x200 + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbc x13, x13, x4 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + add x0, sp, #0x120 + add x1, x27, #0x0 + add x2, sp, #0x48 + bl p521_jscalarmul_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0xd8 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0x90 + add x1, sp, #0x90 + bl p521_jscalarmul_sqr_p521 + ldp x6, x7, [sp, #288] + mov x1, #0xc + mul x3, x1, x6 + mul x4, x1, x7 + umulh x6, x1, x6 + adds x4, x4, x6 + umulh x7, x1, x7 + ldp x8, x9, [sp, #304] + mul x5, x1, x8 + mul x6, x1, x9 + umulh x8, x1, x8 + adcs x5, x5, x7 + umulh x9, x1, x9 + adcs x6, x6, x8 + ldp x10, x11, [sp, #320] + mul x7, x1, x10 + mul x8, x1, x11 + umulh x10, x1, x10 + adcs x7, x7, x9 + umulh x11, x1, x11 + adcs x8, x8, x10 + ldp x12, x13, [sp, #336] + mul x9, x1, x12 + mul x10, x1, x13 + umulh x12, x1, x12 + adcs x9, x9, x11 + umulh x13, x1, x13 + adcs x10, x10, x12 + ldr x14, [sp, #352] + mul x11, x1, x14 + adc x11, x11, x13 + mov x1, #0x9 + ldp x20, x21, [sp, #360] + mvn x20, x20 + mul x0, x1, x20 + umulh x20, x1, x20 + adds x3, x3, x0 + mvn x21, x21 + 
mul x0, x1, x21 + umulh x21, x1, x21 + adcs x4, x4, x0 + ldp x22, x23, [sp, #376] + mvn x22, x22 + mul x0, x1, x22 + umulh x22, x1, x22 + adcs x5, x5, x0 + mvn x23, x23 + mul x0, x1, x23 + umulh x23, x1, x23 + adcs x6, x6, x0 + ldp x17, x19, [sp, #392] + mvn x17, x17 + mul x0, x1, x17 + umulh x17, x1, x17 + adcs x7, x7, x0 + mvn x19, x19 + mul x0, x1, x19 + umulh x19, x1, x19 + adcs x8, x8, x0 + ldp x2, x16, [sp, #408] + mvn x2, x2 + mul x0, x1, x2 + umulh x2, x1, x2 + adcs x9, x9, x0 + mvn x16, x16 + mul x0, x1, x16 + umulh x16, x1, x16 + adcs x10, x10, x0 + ldr x0, [sp, #424] + eor x0, x0, #0x1ff + mul x0, x1, x0 + adc x11, x11, x0 + adds x4, x4, x20 + adcs x5, x5, x21 + and x15, x4, x5 + adcs x6, x6, x22 + and x15, x15, x6 + adcs x7, x7, x23 + and x15, x15, x7 + adcs x8, x8, x17 + and x15, x15, x8 + adcs x9, x9, x19 + and x15, x15, x9 + adcs x10, x10, x2 + and x15, x15, x10 + adc x11, x11, x16 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [sp, #360] + stp x5, x6, [sp, #376] + stp x7, x8, [sp, #392] + stp x9, x10, [sp, #408] + str x11, [sp, #424] + ldp x5, x6, [sp, #144] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #160] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #176] + ldp x4, x3, [sp, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [sp, #192] + ldp x4, x3, [sp, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [sp, #208] + ldr x4, [sp, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + mov x0, sp + add x1, sp, #0x48 + bl p521_jscalarmul_sqr_p521 + add x0, sp, #0xd8 + add x1, sp, #0x168 + add x2, sp, #0xd8 + bl p521_jscalarmul_mul_p521 + ldp x5, x6, [sp, #144] + ldp x4, x3, [sp, #72] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #160] + ldp x4, x3, [sp, #88] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #176] + ldp x4, x3, [sp, #104] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [sp, #192] + ldp x4, x3, [sp, #120] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [sp, #208] + ldr x4, [sp, #136] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x26, #144] + stp x7, x8, [x26, #160] + stp x9, x10, [x26, #176] + stp x11, x12, [x26, #192] + str x13, [x26, #208] + ldp x6, x7, [sp, #288] + lsl x3, x6, #2 + extr x4, x7, x6, #62 + ldp x8, x9, [sp, #304] + extr x5, x8, x7, #62 + extr x6, x9, x8, #62 + ldp x10, x11, [sp, #320] + extr x7, x10, x9, #62 + extr x8, x11, x10, #62 + ldp x12, x13, [sp, #336] + extr x9, x12, x11, #62 + extr x10, x13, x12, #62 + ldr x14, [sp, #352] + extr x11, x14, x13, #62 + ldp x0, x1, [sp, #360] + mvn x0, x0 + adds x3, x3, x0 + sbcs x4, x4, x1 + ldp x0, x1, [sp, #376] + sbcs x5, x5, x0 + and x15, x4, x5 + sbcs x6, x6, x1 + and x15, x15, x6 + ldp x0, x1, [sp, #392] + sbcs x7, x7, x0 + and 
x15, x15, x7 + sbcs x8, x8, x1 + and x15, x15, x8 + ldp x0, x1, [sp, #408] + sbcs x9, x9, x0 + and x15, x15, x9 + sbcs x10, x10, x1 + and x15, x15, x10 + ldr x0, [sp, #424] + eor x0, x0, #0x1ff + adc x11, x11, x0 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [x26] + stp x5, x6, [x26, #16] + stp x7, x8, [x26, #32] + stp x9, x10, [x26, #48] + str x11, [x26, #64] + ldp x6, x7, [sp, #216] + lsl x3, x6, #1 + adds x3, x3, x6 + extr x4, x7, x6, #63 + adcs x4, x4, x7 + ldp x8, x9, [sp, #232] + extr x5, x8, x7, #63 + adcs x5, x5, x8 + extr x6, x9, x8, #63 + adcs x6, x6, x9 + ldp x10, x11, [sp, #248] + extr x7, x10, x9, #63 + adcs x7, x7, x10 + extr x8, x11, x10, #63 + adcs x8, x8, x11 + ldp x12, x13, [sp, #264] + extr x9, x12, x11, #63 + adcs x9, x9, x12 + extr x10, x13, x12, #63 + adcs x10, x10, x13 + ldr x14, [sp, #280] + extr x11, x14, x13, #63 + adc x11, x11, x14 + ldp x20, x21, [sp] + mvn x20, x20 + lsl x0, x20, #3 + adds x3, x3, x0 + mvn x21, x21 + extr x0, x21, x20, #61 + adcs x4, x4, x0 + ldp x22, x23, [sp, #16] + mvn x22, x22 + extr x0, x22, x21, #61 + adcs x5, x5, x0 + and x15, x4, x5 + mvn x23, x23 + extr x0, x23, x22, #61 + adcs x6, x6, x0 + and x15, x15, x6 + ldp x20, x21, [sp, #32] + mvn x20, x20 + extr x0, x20, x23, #61 + adcs x7, x7, x0 + and x15, x15, x7 + mvn x21, x21 + extr x0, x21, x20, #61 + adcs x8, x8, x0 + and x15, x15, x8 + ldp x22, x23, [sp, #48] + mvn x22, x22 + extr x0, x22, x21, #61 + adcs x9, x9, x0 + and x15, x15, x9 + mvn x23, x23 + extr x0, x23, x22, #61 + adcs x10, x10, x0 + and x15, x15, x10 + ldr x0, [sp, #64] + eor x0, x0, #0x1ff + extr x0, x0, x23, #61 + adc x11, x11, x0 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [x26, #72] + stp x5, x6, [x26, #88] + stp x7, x8, [x26, #104] + stp x9, x10, [x26, #120] + str x11, [x26, #136] + add sp, sp, #0x200 + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_mul_p521: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! 
+ sub sp, sp, #80 + ldr q6, [x2] + ldp x10, x17, [x1, #16] + ldr q4, [x1] + ldr q16, [x2, #32] + ldp x5, x20, [x2, #16] + ldr q2, [x1, #32] + movi v31.2D, #0x00000000ffffffff + uzp2 v17.4S, v6.4S, v6.4S + rev64 v7.4S, v6.4S + ldp x15, x21, [x1] + xtn v25.2S, v6.2D + xtn v22.2S, v4.2D + subs x14, x10, x17 + mul v7.4S, v7.4S, v4.4S + csetm x8, cc + rev64 v3.4S, v16.4S + xtn v1.2S, v16.2D + ldp x13, x16, [x2] + mul x26, x10, x5 + uzp2 v16.4S, v16.4S, v16.4S + uaddlp v26.2D, v7.4S + cneg x4, x14, cc + subs x24, x15, x21 + xtn v5.2S, v2.2D + mul v28.4S, v3.4S, v2.4S + shl v26.2D, v26.2D, #32 + mul x22, x17, x20 + umull v20.2D, v22.2S, v25.2S + uzp2 v6.4S, v4.4S, v4.4S + umull v18.2D, v22.2S, v17.2S + uzp2 v4.4S, v2.4S, v2.4S + cneg x14, x24, cc + csetm x7, cc + umulh x11, x17, x20 + usra v18.2D, v20.2D, #32 + uaddlp v7.2D, v28.4S + subs x19, x16, x13 + umlal v26.2D, v22.2S, v25.2S + cneg x19, x19, cc + shl v28.2D, v7.2D, #32 + umull v7.2D, v5.2S, v1.2S + umull v30.2D, v5.2S, v16.2S + cinv x6, x7, cc + mul x25, x14, x19 + umlal v28.2D, v5.2S, v1.2S + umull v21.2D, v6.2S, v17.2S + umulh x14, x14, x19 + usra v30.2D, v7.2D, #32 + subs x9, x20, x5 + and v29.16B, v18.16B, v31.16B + cinv x23, x8, cc + mov x8, v26.d[1] + cneg x12, x9, cc + usra v21.2D, v18.2D, #32 + umlal v29.2D, v6.2S, v25.2S + mul x24, x4, x12 + umull v18.2D, v4.2S, v16.2S + movi v25.2D, #0x00000000ffffffff + eor x9, x14, x6 + and v7.16B, v30.16B, v25.16B + usra v21.2D, v29.2D, #32 + umulh x7, x10, x5 + usra v18.2D, v30.2D, #32 + umlal v7.2D, v4.2S, v1.2S + mov x19, v21.d[0] + umulh x3, x4, x12 + mov x14, v21.d[1] + usra v18.2D, v7.2D, #32 + adds x4, x8, x19 + mov x8, v26.d[0] + adcs x19, x26, x14 + adcs x14, x22, x7 + adc x12, x11, xzr + adds x11, x4, x8 + adcs x26, x19, x4 + adcs x22, x14, x19 + eor x4, x24, x23 + adcs x14, x12, x14 + eor x7, x25, x6 + adc x25, xzr, x12 + eor x19, x3, x23 + adds x3, x26, x8 + adcs x24, x22, x11 + adcs x12, x14, x26 + adcs x22, x25, x22 + adcs x26, xzr, x14 + adc x14, xzr, x25 + cmn x23, #0x1 + adcs x22, x22, x4 + adcs x19, x26, x19 + adc x25, x14, x23 + subs x14, x21, x17 + cneg x23, x14, cc + csetm x26, cc + subs x4, x20, x16 + cneg x14, x4, cc + cinv x4, x26, cc + cmn x6, #0x1 + adcs x11, x11, x7 + mul x7, x23, x14 + adcs x9, x3, x9 + adcs x26, x24, x6 + umulh x3, x23, x14 + adcs x14, x12, x6 + adcs x22, x22, x6 + adcs x12, x19, x6 + extr x24, x11, x8, #55 + adc x6, x25, x6 + subs x19, x15, x17 + csetm x17, cc + cneg x23, x19, cc + subs x19, x20, x13 + lsl x25, x8, #9 + eor x8, x7, x4 + cneg x20, x19, cc + umulh x7, x23, x20 + cinv x19, x17, cc + subs x17, x15, x10 + csetm x15, cc + stp x25, x24, [sp, #32] + cneg x24, x17, cc + mul x20, x23, x20 + subs x25, x5, x13 + cneg x13, x25, cc + cinv x15, x15, cc + mul x25, x24, x13 + subs x21, x21, x10 + csetm x23, cc + cneg x17, x21, cc + subs x21, x5, x16 + umulh x13, x24, x13 + cinv x10, x23, cc + cneg x23, x21, cc + cmn x4, #0x1 + adcs x14, x14, x8 + eor x21, x3, x4 + adcs x21, x22, x21 + eor x5, x20, x19 + adcs x24, x12, x4 + mul x12, x17, x23 + eor x8, x25, x15 + adc x25, x6, x4 + cmn x15, #0x1 + adcs x6, x9, x8 + ldp x20, x8, [x2, #48] + eor x9, x13, x15 + adcs x4, x26, x9 + umulh x26, x17, x23 + ldp x17, x13, [x1, #48] + adcs x9, x14, x15 + adcs x16, x21, x15 + adcs x14, x24, x15 + eor x21, x7, x19 + mul x23, x17, x20 + adc x24, x25, x15 + cmn x19, #0x1 + adcs x7, x4, x5 + adcs x9, x9, x21 + umulh x3, x13, x8 + adcs x16, x16, x19 + adcs x22, x14, x19 + eor x5, x12, x10 + adc x12, x24, x19 + cmn x10, #0x1 + adcs x19, x7, x5 + eor x14, x26, x10 + 
mov x7, v28.d[1] + adcs x24, x9, x14 + extr x4, x19, x6, #55 + umulh x15, x17, x20 + mov x14, v18.d[1] + lsr x9, x19, #55 + adcs x5, x16, x10 + mov x16, v18.d[0] + adcs x19, x22, x10 + str x9, [sp, #64] + extr x25, x6, x11, #55 + adc x21, x12, x10 + subs x26, x17, x13 + stp x25, x4, [sp, #48] + stp x19, x21, [sp, #16] + csetm x6, cc + cneg x4, x26, cc + mul x19, x13, x8 + subs x11, x8, x20 + stp x24, x5, [sp] + ldp x21, x10, [x1, #32] + cinv x12, x6, cc + cneg x6, x11, cc + mov x9, v28.d[0] + umulh x25, x4, x6 + adds x22, x7, x16 + ldp x16, x5, [x2, #32] + adcs x14, x23, x14 + adcs x11, x19, x15 + adc x24, x3, xzr + adds x3, x22, x9 + adcs x15, x14, x22 + mul x22, x4, x6 + adcs x6, x11, x14 + adcs x4, x24, x11 + eor x14, x25, x12 + adc x26, xzr, x24 + subs x7, x21, x10 + csetm x23, cc + cneg x19, x7, cc + subs x24, x5, x16 + cneg x11, x24, cc + cinv x7, x23, cc + adds x25, x15, x9 + eor x23, x22, x12 + adcs x22, x6, x3 + mul x24, x19, x11 + adcs x15, x4, x15 + adcs x6, x26, x6 + umulh x19, x19, x11 + adcs x11, xzr, x4 + adc x26, xzr, x26 + cmn x12, #0x1 + adcs x4, x6, x23 + eor x6, x24, x7 + adcs x14, x11, x14 + adc x26, x26, x12 + subs x11, x10, x13 + cneg x12, x11, cc + csetm x11, cc + eor x19, x19, x7 + subs x24, x8, x5 + cinv x11, x11, cc + cneg x24, x24, cc + cmn x7, #0x1 + adcs x3, x3, x6 + mul x23, x12, x24 + adcs x25, x25, x19 + adcs x6, x22, x7 + umulh x19, x12, x24 + adcs x22, x15, x7 + adcs x12, x4, x7 + eor x24, x23, x11 + adcs x4, x14, x7 + adc x26, x26, x7 + eor x19, x19, x11 + subs x14, x21, x17 + cneg x7, x14, cc + csetm x14, cc + subs x23, x20, x16 + cinv x14, x14, cc + cneg x23, x23, cc + cmn x11, #0x1 + adcs x22, x22, x24 + mul x24, x7, x23 + adcs x15, x12, x19 + adcs x4, x4, x11 + adc x19, x26, x11 + umulh x26, x7, x23 + subs x7, x21, x13 + eor x11, x24, x14 + cneg x23, x7, cc + csetm x12, cc + subs x7, x8, x16 + cneg x7, x7, cc + cinv x12, x12, cc + cmn x14, #0x1 + eor x26, x26, x14 + adcs x11, x25, x11 + mul x25, x23, x7 + adcs x26, x6, x26 + adcs x6, x22, x14 + adcs x24, x15, x14 + umulh x23, x23, x7 + adcs x4, x4, x14 + adc x22, x19, x14 + eor x14, x25, x12 + eor x7, x23, x12 + cmn x12, #0x1 + adcs x14, x26, x14 + ldp x19, x25, [x2] + ldp x15, x23, [x2, #16] + adcs x26, x6, x7 + adcs x24, x24, x12 + adcs x7, x4, x12 + adc x4, x22, x12 + subs x19, x19, x16 + ldp x16, x22, [x1] + sbcs x6, x25, x5 + ldp x12, x25, [x1, #16] + sbcs x15, x15, x20 + sbcs x8, x23, x8 + csetm x23, cc + subs x21, x21, x16 + eor x16, x19, x23 + sbcs x19, x10, x22 + eor x22, x6, x23 + eor x8, x8, x23 + sbcs x6, x17, x12 + sbcs x13, x13, x25 + csetm x12, cc + subs x10, x10, x17 + cneg x17, x10, cc + csetm x25, cc + subs x5, x20, x5 + eor x10, x19, x12 + cneg x19, x5, cc + eor x20, x15, x23 + eor x21, x21, x12 + cinv x15, x25, cc + mul x25, x17, x19 + subs x16, x16, x23 + sbcs x5, x22, x23 + eor x6, x6, x12 + sbcs x20, x20, x23 + eor x22, x13, x12 + sbc x8, x8, x23 + subs x21, x21, x12 + umulh x19, x17, x19 + sbcs x10, x10, x12 + sbcs x17, x6, x12 + eor x6, x19, x15 + eor x19, x25, x15 + umulh x25, x17, x20 + sbc x13, x22, x12 + cmn x15, #0x1 + adcs x22, x14, x19 + adcs x19, x26, x6 + ldp x6, x26, [sp] + adcs x14, x24, x15 + umulh x24, x21, x16 + adcs x7, x7, x15 + adc x15, x4, x15 + adds x4, x9, x6 + eor x9, x23, x12 + adcs x12, x3, x26 + stp x4, x12, [sp] + ldp x4, x26, [sp, #16] + umulh x12, x10, x5 + ldp x6, x23, [sp, #32] + adcs x3, x11, x4 + mul x4, x13, x8 + adcs x26, x22, x26 + ldp x22, x11, [sp, #48] + adcs x6, x19, x6 + stp x3, x26, [sp, #16] + mul x26, x10, x5 + adcs x14, x14, x23 + 
stp x6, x14, [sp, #32] + ldr x6, [sp, #64] + adcs x22, x7, x22 + adcs x14, x15, x11 + mul x11, x17, x20 + adc x19, x6, xzr + stp x22, x14, [sp, #48] + adds x14, x26, x24 + str x19, [sp, #64] + umulh x19, x13, x8 + adcs x7, x11, x12 + adcs x22, x4, x25 + mul x6, x21, x16 + adc x19, x19, xzr + subs x11, x17, x13 + cneg x12, x11, cc + csetm x11, cc + subs x24, x8, x20 + cinv x11, x11, cc + cneg x24, x24, cc + adds x4, x14, x6 + adcs x14, x7, x14 + mul x3, x12, x24 + adcs x7, x22, x7 + adcs x22, x19, x22 + umulh x12, x12, x24 + adc x24, xzr, x19 + adds x19, x14, x6 + eor x3, x3, x11 + adcs x26, x7, x4 + adcs x14, x22, x14 + adcs x25, x24, x7 + adcs x23, xzr, x22 + eor x7, x12, x11 + adc x12, xzr, x24 + subs x22, x21, x10 + cneg x24, x22, cc + csetm x22, cc + subs x15, x5, x16 + cinv x22, x22, cc + cneg x15, x15, cc + cmn x11, #0x1 + adcs x3, x25, x3 + mul x25, x24, x15 + adcs x23, x23, x7 + adc x11, x12, x11 + subs x7, x10, x13 + umulh x15, x24, x15 + cneg x12, x7, cc + csetm x7, cc + eor x24, x25, x22 + eor x25, x15, x22 + cmn x22, #0x1 + adcs x24, x4, x24 + adcs x19, x19, x25 + adcs x15, x26, x22 + adcs x4, x14, x22 + adcs x26, x3, x22 + adcs x25, x23, x22 + adc x23, x11, x22 + subs x14, x21, x17 + cneg x3, x14, cc + csetm x11, cc + subs x14, x8, x5 + cneg x14, x14, cc + cinv x7, x7, cc + subs x13, x21, x13 + cneg x21, x13, cc + csetm x13, cc + mul x22, x12, x14 + subs x8, x8, x16 + cinv x13, x13, cc + umulh x14, x12, x14 + cneg x12, x8, cc + subs x8, x20, x16 + cneg x8, x8, cc + cinv x16, x11, cc + eor x22, x22, x7 + cmn x7, #0x1 + eor x14, x14, x7 + adcs x4, x4, x22 + mul x11, x3, x8 + adcs x22, x26, x14 + adcs x14, x25, x7 + eor x25, x24, x9 + adc x26, x23, x7 + umulh x7, x3, x8 + subs x17, x10, x17 + cneg x24, x17, cc + eor x3, x11, x16 + csetm x11, cc + subs x20, x20, x5 + cneg x5, x20, cc + cinv x11, x11, cc + cmn x16, #0x1 + mul x17, x21, x12 + eor x8, x7, x16 + adcs x10, x19, x3 + and x19, x9, #0x1ff + adcs x20, x15, x8 + umulh x15, x21, x12 + eor x12, x10, x9 + eor x8, x6, x9 + adcs x6, x4, x16 + adcs x4, x22, x16 + adcs x21, x14, x16 + adc x7, x26, x16 + mul x10, x24, x5 + cmn x13, #0x1 + ldp x3, x14, [x1] + eor x17, x17, x13 + umulh x5, x24, x5 + adcs x20, x20, x17 + eor x17, x15, x13 + adcs x16, x6, x17 + eor x22, x10, x11 + adcs x23, x4, x13 + extr x10, x14, x3, #52 + and x26, x3, #0xfffffffffffff + adcs x24, x21, x13 + and x15, x10, #0xfffffffffffff + adc x6, x7, x13 + cmn x11, #0x1 + adcs x17, x20, x22 + eor x4, x5, x11 + ldp x21, x10, [sp] + adcs x7, x16, x4 + eor x16, x17, x9 + eor x13, x7, x9 + ldp x3, x17, [sp, #16] + adcs x7, x23, x11 + eor x23, x7, x9 + ldp x5, x22, [sp, #32] + adcs x7, x24, x11 + adc x24, x6, x11 + ldr x6, [x2, #64] + adds x20, x8, x21 + lsl x11, x20, #9 + eor x4, x7, x9 + orr x7, x11, x19 + eor x8, x24, x9 + adcs x11, x25, x10 + mul x26, x6, x26 + ldp x19, x24, [sp, #48] + adcs x12, x12, x3 + adcs x16, x16, x17 + adcs x9, x13, x5 + ldr x25, [sp, #64] + extr x20, x11, x20, #55 + adcs x13, x23, x22 + adcs x4, x4, x19 + extr x23, x12, x11, #55 + adcs x8, x8, x24 + adc x11, x25, xzr + adds x21, x9, x21 + extr x9, x16, x12, #55 + lsr x12, x16, #55 + adcs x10, x13, x10 + mul x15, x6, x15 + adcs x13, x4, x3 + ldp x16, x4, [x2] + ldr x3, [x1, #64] + adcs x17, x8, x17 + adcs x5, x5, x7 + adcs x20, x22, x20 + adcs x8, x19, x23 + and x22, x16, #0xfffffffffffff + ldp x19, x7, [x1, #16] + adcs x9, x24, x9 + extr x24, x4, x16, #52 + adc x16, x12, x25 + mul x22, x3, x22 + and x25, x24, #0xfffffffffffff + extr x14, x19, x14, #40 + and x12, x14, #0xfffffffffffff + extr 
x23, x7, x19, #28 + ldp x19, x24, [x2, #16] + mul x14, x3, x25 + and x23, x23, #0xfffffffffffff + add x22, x26, x22 + lsl x11, x11, #48 + lsr x26, x22, #52 + lsl x25, x22, #12 + mul x22, x6, x12 + extr x12, x19, x4, #40 + add x4, x15, x14 + mul x15, x6, x23 + add x4, x4, x26 + extr x23, x24, x19, #28 + ldp x14, x19, [x1, #32] + and x26, x12, #0xfffffffffffff + extr x12, x4, x25, #12 + and x25, x23, #0xfffffffffffff + adds x21, x21, x12 + mul x12, x3, x26 + extr x23, x14, x7, #16 + and x23, x23, #0xfffffffffffff + mul x7, x3, x25 + ldp x25, x26, [x2, #32] + add x12, x22, x12 + extr x22, x19, x14, #56 + mul x23, x6, x23 + lsr x14, x14, #4 + extr x24, x25, x24, #16 + add x7, x15, x7 + and x15, x24, #0xfffffffffffff + and x22, x22, #0xfffffffffffff + lsr x24, x4, #52 + mul x15, x3, x15 + and x14, x14, #0xfffffffffffff + add x12, x12, x24 + lsl x24, x4, #12 + lsr x4, x12, #52 + extr x24, x12, x24, #24 + adcs x10, x10, x24 + lsl x24, x12, #12 + add x12, x7, x4 + mul x22, x6, x22 + add x4, x23, x15 + extr x7, x12, x24, #36 + adcs x13, x13, x7 + lsl x15, x12, #12 + add x7, x4, x11 + lsr x24, x12, #52 + ldp x23, x11, [x2, #48] + add x4, x7, x24 + mul x12, x6, x14 + extr x7, x26, x25, #56 + extr x14, x4, x15, #48 + and x2, x7, #0xfffffffffffff + extr x24, x11, x23, #32 + ldp x15, x7, [x1, #48] + and x1, x24, #0xfffffffffffff + lsr x24, x4, #52 + mul x2, x3, x2 + extr x26, x23, x26, #44 + lsr x23, x25, #4 + and x23, x23, #0xfffffffffffff + and x25, x26, #0xfffffffffffff + extr x26, x7, x15, #32 + extr x19, x15, x19, #44 + mul x23, x3, x23 + and x15, x26, #0xfffffffffffff + lsl x26, x4, #12 + and x4, x19, #0xfffffffffffff + lsr x11, x11, #20 + mul x19, x6, x4 + adcs x17, x17, x14 + add x14, x22, x2 + add x22, x12, x23 + lsr x7, x7, #20 + add x22, x22, x24 + extr x2, x22, x26, #60 + mul x24, x3, x25 + lsr x22, x22, #52 + add x14, x14, x22 + lsl x22, x2, #8 + extr x22, x14, x22, #8 + lsl x2, x14, #12 + mul x1, x3, x1 + adcs x12, x5, x22 + mul x5, x6, x15 + and x26, x10, x13 + and x4, x26, x17 + add x23, x19, x24 + lsr x14, x14, #52 + mul x22, x3, x11 + add x11, x23, x14 + extr x25, x11, x2, #20 + lsl x19, x11, #12 + adcs x25, x20, x25 + and x14, x4, x12 + add x1, x5, x1 + and x14, x14, x25 + mul x15, x6, x7 + add x26, x15, x22 + mul x6, x6, x3 + lsr x22, x11, #52 + add x4, x1, x22 + lsr x1, x4, #52 + extr x3, x4, x19, #32 + lsl x15, x4, #12 + add x7, x26, x1 + adcs x23, x8, x3 + extr x20, x7, x15, #44 + and x3, x14, x23 + lsr x19, x7, #44 + adcs x7, x9, x20 + add x11, x6, x19 + adc x4, x16, x11 + lsr x14, x4, #9 + cmp xzr, xzr + and x15, x3, x7 + orr x3, x4, #0xfffffffffffffe00 + adcs xzr, x21, x14 + adcs xzr, x15, xzr + adcs xzr, x3, xzr + adcs x11, x21, x14 + and x14, x11, #0x1ff + adcs x1, x10, xzr + extr x10, x1, x11, #9 + str x14, [x0, #64] + adcs x14, x13, xzr + extr x11, x14, x1, #9 + adcs x1, x17, xzr + extr x4, x1, x14, #9 + stp x10, x11, [x0] + adcs x11, x12, xzr + extr x14, x11, x1, #9 + adcs x10, x25, xzr + extr x11, x10, x11, #9 + stp x4, x14, [x0, #16] + adcs x14, x23, xzr + extr x10, x14, x10, #9 + adcs x1, x7, xzr + stp x11, x10, [x0, #32] + extr x14, x1, x14, #9 + adc x10, x3, xzr + extr x26, x10, x1, #9 + stp x14, x26, [x0, #48] + add sp, sp, #80 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_sqr_p521: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! 
+ ldr q23, [x1, #32] + ldp x9, x2, [x1, #32] + ldr q16, [x1, #32] + ldr q20, [x1, #48] + ldp x6, x13, [x1, #48] + rev64 v2.4S, v23.4S + mul x14, x9, x2 + ldr q31, [x1, #48] + subs x22, x9, x2 + uzp2 v26.4S, v23.4S, v23.4S + mul v30.4S, v2.4S, v16.4S + xtn v0.2S, v20.2D + csetm x12, cc + xtn v21.2S, v16.2D + xtn v23.2S, v23.2D + umulh x10, x9, x6 + rev64 v27.4S, v31.4S + umull v2.2D, v21.2S, v26.2S + cneg x23, x22, cc + uaddlp v25.2D, v30.4S + umull v18.2D, v21.2S, v23.2S + mul x22, x9, x6 + mul v6.4S, v27.4S, v20.4S + uzp2 v17.4S, v20.4S, v20.4S + shl v20.2D, v25.2D, #32 + uzp2 v27.4S, v31.4S, v31.4S + mul x16, x2, x13 + umlal v20.2D, v21.2S, v23.2S + usra v2.2D, v18.2D, #32 + adds x8, x22, x10 + umull v25.2D, v17.2S, v27.2S + xtn v31.2S, v31.2D + movi v1.2D, #0xffffffff + adc x3, x10, xzr + umulh x21, x2, x13 + uzp2 v21.4S, v16.4S, v16.4S + umull v18.2D, v0.2S, v27.2S + subs x19, x13, x6 + and v7.16B, v2.16B, v1.16B + umull v27.2D, v0.2S, v31.2S + cneg x20, x19, cc + movi v30.2D, #0xffffffff + umull v16.2D, v21.2S, v26.2S + umlal v7.2D, v21.2S, v23.2S + mul x19, x23, x20 + cinv x7, x12, cc + uaddlp v6.2D, v6.4S + eor x12, x19, x7 + adds x11, x8, x16 + umulh x10, x23, x20 + ldr q1, [x1] + usra v16.2D, v2.2D, #32 + adcs x19, x3, x21 + shl v2.2D, v6.2D, #32 + adc x20, x21, xzr + adds x17, x19, x16 + usra v18.2D, v27.2D, #32 + adc x19, x20, xzr + cmn x7, #0x1 + umlal v2.2D, v0.2S, v31.2S + umulh x16, x9, x2 + adcs x8, x11, x12 + usra v16.2D, v7.2D, #32 + ldr x12, [x1, #64] + eor x20, x10, x7 + umulh x10, x6, x13 + mov x23, v2.d[0] + mov x3, v2.d[1] + adcs x21, x17, x20 + usra v25.2D, v18.2D, #32 + and v23.16B, v18.16B, v30.16B + adc x7, x19, x7 + adds x22, x22, x22 + ldr q7, [x1, #16] + adcs x17, x8, x8 + umlal v23.2D, v17.2S, v31.2S + mov x19, v16.d[0] + mul x11, x12, x12 + ldr q4, [x1] + usra v25.2D, v23.2D, #32 + add x5, x12, x12 + adcs x15, x21, x21 + ldr q28, [x1] + mov x12, v20.d[1] + adcs x24, x7, x7 + mov x21, v16.d[1] + adc x4, xzr, xzr + adds x19, x19, x14 + ldr q18, [x1, #16] + xtn v26.2S, v1.2D + adcs x8, x12, x16 + adc x21, x21, xzr + adds x7, x19, x14 + xtn v23.2S, v7.2D + rev64 v21.4S, v28.4S + adcs x12, x8, x16 + ldp x20, x19, [x1] + mov x16, v25.d[1] + xtn v22.2S, v28.2D + adc x14, x21, xzr + adds x8, x22, x12 + uzp2 v24.4S, v28.4S, v28.4S + rev64 v28.4S, v18.4S + mul x12, x6, x13 + mul v16.4S, v21.4S, v1.4S + shrn v31.2S, v7.2D, #32 + adcs x22, x17, x14 + mov x14, v25.d[0] + and x21, x20, #0xfffffffffffff + umull v17.2D, v26.2S, v24.2S + ldr q2, [x1, #32] + adcs x17, x15, xzr + ldr q30, [x1, #48] + umull v7.2D, v26.2S, v22.2S + adcs x15, x24, xzr + ldr q0, [x1, #16] + movi v6.2D, #0xffffffff + adc x4, x4, xzr + adds x14, x14, x12 + uzp1 v27.4S, v18.4S, v4.4S + uzp2 v19.4S, v1.4S, v1.4S + adcs x24, x3, x10 + mul x3, x5, x21 + umull v29.2D, v23.2S, v31.2S + ldr q5, [x1] + adc x21, x16, xzr + adds x16, x14, x12 + extr x12, x19, x20, #52 + umull v18.2D, v19.2S, v24.2S + adcs x24, x24, x10 + and x10, x12, #0xfffffffffffff + ldp x14, x12, [x1, #16] + usra v17.2D, v7.2D, #32 + adc x21, x21, xzr + adds x23, x23, x17 + mul x17, x5, x10 + shl v21.2D, v29.2D, #33 + lsl x10, x3, #12 + lsr x1, x3, #52 + rev64 v29.4S, v2.4S + uaddlp v25.2D, v16.4S + add x17, x17, x1 + adcs x16, x16, x15 + extr x3, x14, x19, #40 + mov x15, v20.d[0] + extr x10, x17, x10, #12 + and x3, x3, #0xfffffffffffff + shl v3.2D, v25.2D, #32 + and v6.16B, v17.16B, v6.16B + mul x1, x5, x3 + usra v18.2D, v17.2D, #32 + adcs x3, x24, x4 + extr x4, x12, x14, #28 + umlal v6.2D, v19.2S, v22.2S + xtn v20.2S, v2.2D + umlal 
v3.2D, v26.2S, v22.2S + movi v26.2D, #0xffffffff + lsr x24, x17, #52 + and x4, x4, #0xfffffffffffff + uzp2 v19.4S, v2.4S, v2.4S + add x1, x1, x24 + mul x24, x5, x4 + lsl x4, x17, #12 + xtn v24.2S, v5.2D + extr x17, x1, x4, #24 + adc x21, x21, xzr + umlal v21.2D, v23.2S, v23.2S + adds x4, x15, x10 + lsl x10, x1, #12 + adcs x15, x7, x17 + mul v23.4S, v28.4S, v4.4S + and x7, x4, #0x1ff + lsr x17, x1, #52 + umulh x1, x19, x12 + uzp2 v17.4S, v5.4S, v5.4S + extr x4, x15, x4, #9 + add x24, x24, x17 + mul v29.4S, v29.4S, v5.4S + extr x17, x24, x10, #36 + extr x10, x9, x12, #16 + uzp1 v28.4S, v4.4S, v4.4S + adcs x17, x8, x17 + and x8, x10, #0xfffffffffffff + umull v16.2D, v24.2S, v20.2S + extr x10, x17, x15, #9 + mul x15, x5, x8 + stp x4, x10, [x0] + lsl x4, x24, #12 + lsr x8, x9, #4 + uaddlp v4.2D, v23.4S + and x8, x8, #0xfffffffffffff + umull v23.2D, v24.2S, v19.2S + mul x8, x5, x8 + extr x10, x2, x9, #56 + lsr x24, x24, #52 + and x10, x10, #0xfffffffffffff + add x15, x15, x24 + extr x4, x15, x4, #48 + mul x24, x5, x10 + lsr x10, x15, #52 + usra v23.2D, v16.2D, #32 + add x10, x8, x10 + shl v4.2D, v4.2D, #32 + adcs x22, x22, x4 + extr x4, x6, x2, #44 + lsl x15, x15, #12 + lsr x8, x10, #52 + extr x15, x10, x15, #60 + and x10, x4, #0xfffffffffffff + umlal v4.2D, v28.2S, v27.2S + add x8, x24, x8 + extr x4, x13, x6, #32 + mul x24, x5, x10 + uzp2 v16.4S, v30.4S, v30.4S + lsl x10, x15, #8 + rev64 v28.4S, v30.4S + and x15, x4, #0xfffffffffffff + extr x4, x8, x10, #8 + mul x10, x5, x15 + lsl x15, x8, #12 + adcs x23, x23, x4 + lsr x4, x8, #52 + lsr x8, x13, #20 + add x4, x24, x4 + mul x8, x5, x8 + lsr x24, x4, #52 + extr x15, x4, x15, #20 + lsl x4, x4, #12 + add x10, x10, x24 + adcs x15, x16, x15 + extr x4, x10, x4, #32 + umulh x5, x20, x14 + adcs x3, x3, x4 + usra v18.2D, v6.2D, #32 + lsl x16, x10, #12 + extr x24, x15, x23, #9 + lsr x10, x10, #52 + uzp2 v27.4S, v0.4S, v0.4S + add x8, x8, x10 + extr x10, x3, x15, #9 + extr x4, x22, x17, #9 + and v25.16B, v23.16B, v26.16B + lsr x17, x8, #44 + extr x15, x8, x16, #44 + extr x16, x23, x22, #9 + xtn v7.2S, v30.2D + mov x8, v4.d[0] + stp x24, x10, [x0, #32] + uaddlp v30.2D, v29.4S + stp x4, x16, [x0, #16] + umulh x24, x20, x19 + adcs x15, x21, x15 + adc x16, x11, x17 + subs x11, x20, x19 + xtn v5.2S, v0.2D + csetm x17, cc + extr x3, x15, x3, #9 + mov x22, v4.d[1] + cneg x21, x11, cc + subs x10, x12, x14 + mul v31.4S, v28.4S, v0.4S + cneg x10, x10, cc + cinv x11, x17, cc + shl v4.2D, v30.2D, #32 + umull v28.2D, v5.2S, v16.2S + extr x23, x16, x15, #9 + adds x4, x8, x5 + mul x17, x21, x10 + umull v22.2D, v5.2S, v7.2S + adc x15, x5, xzr + adds x4, x4, x22 + uaddlp v2.2D, v31.4S + lsr x5, x16, #9 + adcs x16, x15, x1 + mov x15, v18.d[0] + adc x1, x1, xzr + umulh x10, x21, x10 + adds x22, x16, x22 + umlal v4.2D, v24.2S, v20.2S + umull v30.2D, v27.2S, v16.2S + stp x3, x23, [x0, #48] + add x3, x7, x5 + adc x16, x1, xzr + usra v28.2D, v22.2D, #32 + mul x23, x20, x19 + eor x1, x17, x11 + cmn x11, #0x1 + mov x17, v18.d[1] + umull v18.2D, v17.2S, v19.2S + adcs x7, x4, x1 + eor x1, x10, x11 + umlal v25.2D, v17.2S, v20.2S + movi v16.2D, #0xffffffff + adcs x22, x22, x1 + usra v18.2D, v23.2D, #32 + umulh x4, x14, x14 + adc x1, x16, x11 + adds x10, x8, x8 + shl v23.2D, v2.2D, #32 + str x3, [x0, #64] + adcs x5, x7, x7 + and v16.16B, v28.16B, v16.16B + usra v30.2D, v28.2D, #32 + adcs x7, x22, x22 + mov x21, v3.d[1] + adcs x11, x1, x1 + umlal v16.2D, v27.2S, v7.2S + adc x22, xzr, xzr + adds x16, x15, x23 + mul x8, x14, x12 + umlal v23.2D, v5.2S, v7.2S + usra v18.2D, v25.2D, #32 + 
umulh x15, x14, x12 + adcs x21, x21, x24 + usra v30.2D, v16.2D, #32 + adc x1, x17, xzr + adds x3, x16, x23 + adcs x21, x21, x24 + adc x1, x1, xzr + adds x24, x10, x21 + umulh x21, x12, x12 + adcs x16, x5, x1 + adcs x10, x7, xzr + mov x17, v21.d[1] + adcs x23, x11, xzr + adc x5, x22, xzr + adds x1, x4, x8 + adcs x22, x17, x15 + ldp x17, x4, [x0] + mov x11, v21.d[0] + adc x21, x21, xzr + adds x1, x1, x8 + adcs x15, x22, x15 + adc x8, x21, xzr + adds x22, x11, x10 + mov x21, v3.d[0] + adcs x11, x1, x23 + ldp x1, x10, [x0, #16] + adcs x15, x15, x5 + adc x7, x8, xzr + adds x8, x17, x21 + mov x23, v4.d[1] + ldp x5, x21, [x0, #32] + adcs x17, x4, x3 + ldr x4, [x0, #64] + mov x3, v18.d[0] + adcs x24, x1, x24 + stp x8, x17, [x0] + adcs x17, x10, x16 + ldp x1, x16, [x0, #48] + adcs x5, x5, x22 + adcs x8, x21, x11 + stp x5, x8, [x0, #32] + adcs x1, x1, x15 + mov x15, v23.d[1] + adcs x21, x16, x7 + stp x1, x21, [x0, #48] + adc x10, x4, xzr + subs x7, x14, x12 + mov x16, v18.d[1] + cneg x5, x7, cc + csetm x4, cc + subs x11, x13, x6 + mov x8, v23.d[0] + cneg x7, x11, cc + cinv x21, x4, cc + mov x11, v30.d[0] + adds x4, x23, x3 + mul x22, x5, x7 + mov x23, v30.d[1] + adcs x8, x8, x16 + adcs x16, x15, x11 + adc x11, x23, xzr + umulh x3, x5, x7 + stp x24, x17, [x0, #16] + mov x5, v4.d[0] + subs x15, x20, x19 + cneg x7, x15, cc + str x10, [x0, #64] + csetm x1, cc + subs x24, x2, x9 + cneg x17, x24, cc + cinv x15, x1, cc + adds x23, x4, x5 + umulh x1, x7, x17 + adcs x24, x8, x4 + adcs x10, x16, x8 + eor x8, x22, x21 + adcs x16, x11, x16 + mul x22, x7, x17 + eor x17, x1, x15 + adc x1, xzr, x11 + adds x11, x24, x5 + eor x7, x3, x21 + adcs x3, x10, x23 + adcs x24, x16, x24 + adcs x4, x1, x10 + eor x10, x22, x15 + adcs x16, xzr, x16 + adc x1, xzr, x1 + cmn x21, #0x1 + adcs x8, x4, x8 + adcs x22, x16, x7 + adc x7, x1, x21 + subs x21, x19, x12 + csetm x4, cc + cneg x1, x21, cc + subs x21, x13, x2 + cinv x16, x4, cc + cneg x4, x21, cc + cmn x15, #0x1 + adcs x21, x23, x10 + mul x23, x1, x4 + adcs x11, x11, x17 + adcs x3, x3, x15 + umulh x1, x1, x4 + adcs x24, x24, x15 + adcs x8, x8, x15 + adcs x22, x22, x15 + eor x17, x23, x16 + adc x15, x7, x15 + subs x7, x20, x14 + cneg x7, x7, cc + csetm x4, cc + subs x10, x20, x12 + cneg x23, x10, cc + csetm x10, cc + subs x12, x6, x9 + cinv x20, x4, cc + cneg x12, x12, cc + cmn x16, #0x1 + eor x1, x1, x16 + adcs x17, x24, x17 + mul x4, x7, x12 + adcs x8, x8, x1 + umulh x1, x7, x12 + adcs x24, x22, x16 + adc x7, x15, x16 + subs x12, x13, x9 + cneg x12, x12, cc + cinv x13, x10, cc + subs x19, x19, x14 + mul x9, x23, x12 + cneg x19, x19, cc + csetm x10, cc + eor x16, x1, x20 + subs x22, x6, x2 + umulh x12, x23, x12 + eor x1, x4, x20 + cinv x4, x10, cc + cneg x22, x22, cc + cmn x20, #0x1 + adcs x15, x11, x1 + eor x6, x12, x13 + adcs x10, x3, x16 + adcs x17, x17, x20 + eor x23, x9, x13 + adcs x2, x8, x20 + mul x11, x19, x22 + adcs x24, x24, x20 + adc x7, x7, x20 + cmn x13, #0x1 + adcs x3, x10, x23 + umulh x22, x19, x22 + adcs x17, x17, x6 + eor x12, x22, x4 + extr x22, x15, x21, #63 + adcs x8, x2, x13 + extr x21, x21, x5, #63 + ldp x16, x23, [x0] + adcs x20, x24, x13 + eor x1, x11, x4 + adc x6, x7, x13 + cmn x4, #0x1 + ldp x2, x7, [x0, #16] + adcs x1, x3, x1 + extr x19, x1, x15, #63 + adcs x14, x17, x12 + extr x1, x14, x1, #63 + lsl x17, x5, #1 + adcs x8, x8, x4 + extr x12, x8, x14, #8 + ldp x15, x11, [x0, #32] + adcs x9, x20, x4 + adc x3, x6, x4 + adds x16, x12, x16 + extr x6, x9, x8, #8 + ldp x14, x12, [x0, #48] + extr x8, x3, x9, #8 + adcs x20, x6, x23 + ldr x24, [x0, #64] + lsr 
x6, x3, #8 + adcs x8, x8, x2 + and x2, x1, #0x1ff + and x1, x20, x8 + adcs x4, x6, x7 + adcs x3, x17, x15 + and x1, x1, x4 + adcs x9, x21, x11 + and x1, x1, x3 + adcs x6, x22, x14 + and x1, x1, x9 + and x21, x1, x6 + adcs x14, x19, x12 + adc x1, x24, x2 + cmp xzr, xzr + orr x12, x1, #0xfffffffffffffe00 + lsr x1, x1, #9 + adcs xzr, x16, x1 + and x21, x21, x14 + adcs xzr, x21, xzr + adcs xzr, x12, xzr + adcs x21, x16, x1 + adcs x1, x20, xzr + adcs x19, x8, xzr + stp x21, x1, [x0] + adcs x1, x4, xzr + adcs x21, x3, xzr + stp x19, x1, [x0, #16] + adcs x1, x9, xzr + stp x21, x1, [x0, #32] + adcs x21, x6, xzr + adcs x1, x14, xzr + stp x21, x1, [x0, #48] + adc x1, x12, xzr + and x1, x1, #0x1ff + str x1, [x0, #64] + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S new file mode 100644 index 0000000000..89e0408d8b --- /dev/null +++ b/third_party/s2n-bignum/arm/p521/p521_jscalarmul_alt.S @@ -0,0 +1,2102 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +// +// extern void p521_jscalarmul_alt +// (uint64_t res[static 27], +// uint64_t scalar[static 9], +// uint64_t point[static 27]); +// +// This function is a variant of its affine point version p521_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// a triple (x,y,z) representing the affine point (x/z^2,y/z^3) when +// z is nonzero or the point at infinity (group identity) if z = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-521, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_521) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 +#define JACSIZE (3*NUMSIZE) + +// Safe copies of input res and additional values in variables. 
+ +#define tabup x15 +#define bf x16 +#define sgn x17 +#define j x19 +#define res x20 + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +// Round up to maintain stack alignment + +#define NSPACE #(55*NUMSIZE+8) + +#define selectblock(I) \ + cmp bf, #(1*I); \ + ldp x10, x11, [tabup]; \ + csel x0, x10, x0, eq; \ + csel x1, x11, x1, eq; \ + ldp x10, x11, [tabup, #16]; \ + csel x2, x10, x2, eq; \ + csel x3, x11, x3, eq; \ + ldp x10, x11, [tabup, #32]; \ + csel x4, x10, x4, eq; \ + csel x5, x11, x5, eq; \ + ldp x10, x11, [tabup, #48]; \ + csel x6, x10, x6, eq; \ + csel x7, x11, x7, eq; \ + ldr x10, [tabup, #64]; \ + csel x8, x10, x8, eq; \ + add tabup, tabup, #JACSIZE + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p521_jscalarmul_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" input argument; others get processed early. + + mov res, x0 + +// Reduce the input scalar mod n_521 and store it to "scalarb". + + mov x19, x2 + add x0, scalarb + bl p521_jscalarmul_alt_bignum_mod_n521_9 + mov x2, x19 + +// Set the tab[0] table entry to the input point = 1 * P, but also +// reduce all coordinates modulo p. In principle we assume reduction +// as a precondition, but this reduces the scope for surprise, e.g. +// making sure that any input with z = 0 is treated as zero, even +// if the other coordinates are not in fact reduced. + + add x0, tab + mov x1, x19 + bl p521_jscalarmul_alt_bignum_mod_p521_9 + + add x0, tab+NUMSIZE + add x1, x19, #NUMSIZE + bl p521_jscalarmul_alt_bignum_mod_p521_9 + + add x0, tab+2*NUMSIZE + add x1, x19, #(2*NUMSIZE) + bl p521_jscalarmul_alt_bignum_mod_p521_9 + +// If bit 520 of the scalar is set, then negate the scalar mod n_521, +// i.e. do scalar |-> n_521 - scalar, and also the point to compensate +// by negating its y coordinate. This further step is not needed by +// the indexing scheme (the top window is only a couple of bits either +// way), but is convenient to exclude a problem with the specific value +// scalar = n_521 - 18, where the last Jacobian addition is of the form +// (n_521 - 9) * P + -(9 * P) and hence is a degenerate doubling case. 
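+
+// An illustrative sketch of this compensation (hypothetical C, not part of
+// this file): it uses the identity n * P = (n_521 - n) * (-P), where the
+// negation of a Jacobian point (x,y,z) is (x, p_521 - y, z). The helpers
+// bignum_sub_n521 and point_negate_y are stand-ins for the inline code below.
+//
+//     if ((scalar[8] >> 8) & 1) {                  // bit 520 of the reduced scalar
+//         bignum_sub_n521(scalar, n_521, scalar);  // scalar := n_521 - scalar
+//         point_negate_y(P);                       // y := p_521 - y, keeping y = 0 as 0
+//     }
+//
+// The actual code performs both updates branch-free, with csel for the scalar
+// words and a masked XOR for the y coordinate of tab[0].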
+ + ldp x0, x1, [scalarb] + movbig(x10, #0xbb6f, #0xb71e, #0x9138, #0x6409) + subs x10, x10, x0 + movbig(x11, #0x3bb5, #0xc9b8, #0x899c, #0x47ae) + sbcs x11, x11, x1 + ldp x2, x3, [scalarb+16] + movbig(x12, #0x7fcc, #0x0148, #0xf709, #0xa5d0) + sbcs x12, x12, x2 + movbig(x13, #0x5186, #0x8783, #0xbf2f, #0x966b) + sbcs x13, x13, x3 + ldp x4, x5, [scalarb+32] + mov x14, 0xfffffffffffffffa + sbcs x14, x14, x4 + mov x15, 0xffffffffffffffff + sbcs x15, x15, x5 + ldp x6, x7, [scalarb+48] + mov x16, 0xffffffffffffffff + sbcs x16, x16, x6 + mov x17, 0xffffffffffffffff + sbcs x17, x17, x7 + ldr x8, [scalarb+64] + mov x19, 0x00000000000001ff + sbc x19, x19, x8 + tst x8, 0x100 + csetm x9, ne + csel x0, x10, x0, ne + csel x1, x11, x1, ne + csel x2, x12, x2, ne + csel x3, x13, x3, ne + csel x4, x14, x4, ne + csel x5, x15, x5, ne + csel x6, x16, x6, ne + csel x7, x17, x7, ne + csel x8, x19, x8, ne + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + + add tabup, tab + ldp x0, x1, [tabup, #NUMSIZE] + ldp x2, x3, [tabup, #NUMSIZE+16] + ldp x4, x5, [tabup, #NUMSIZE+32] + ldp x6, x7, [tabup, #NUMSIZE+48] + ldr x8, [tabup, #NUMSIZE+64] + orr x10, x0, x1 + orr x11, x2, x3 + orr x12, x4, x5 + orr x13, x6, x7 + orr x10, x10, x11 + orr x12, x12, x13 + orr x12, x12, x8 + orr x10, x10, x12 + cmp x10, xzr + csel x9, x9, xzr, ne + eor x0, x0, x9 + eor x1, x1, x9 + eor x2, x2, x9 + eor x3, x3, x9 + eor x4, x4, x9 + eor x5, x5, x9 + eor x6, x6, x9 + eor x7, x7, x9 + and x9, x9, #0x1FF + eor x8, x8, x9 + stp x0, x1, [tabup, #NUMSIZE] + stp x2, x3, [tabup, #NUMSIZE+16] + stp x4, x5, [tabup, #NUMSIZE+32] + stp x6, x7, [tabup, #NUMSIZE+48] + str x8, [tabup, #NUMSIZE+64] + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + add x0, tab+JACSIZE*1 + add x1, tab + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*2 + add x1, tab+JACSIZE*1 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*3 + add x1, tab+JACSIZE*1 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*4 + add x1, tab+JACSIZE*3 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*5 + add x1, tab+JACSIZE*2 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*6 + add x1, tab+JACSIZE*5 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*7 + add x1, tab+JACSIZE*3 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*8 + add x1, tab+JACSIZE*7 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*9 + add x1, tab+JACSIZE*4 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*10 + add x1, tab+JACSIZE*9 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*11 + add x1, tab+JACSIZE*5 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*12 + add x1, tab+JACSIZE*11 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*13 + add x1, tab+JACSIZE*6 + bl p521_jscalarmul_alt_jdouble + + add x0, tab+JACSIZE*14 + add x1, tab+JACSIZE*13 + add x2, tab + bl p521_jscalarmul_alt_jadd + + add x0, tab+JACSIZE*15 + add x1, tab+JACSIZE*7 + bl p521_jscalarmul_alt_jdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically since none is a simple ARM load. 
+// +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x4210842108421084 +// 0x8421084210842108 +// 0x0842108421084210 +// 0x1084210842108421 +// 0x2108421084210842 +// 0x0000000000000084 + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + ldp x6, x7, [scalarb+48] + ldr x8, [scalarb+64] + + movbig(x10, #0x1084, #0x2108, #0x4210, #0x8421) + adds x0, x0, x10, lsr #1 + adcs x1, x1, x10 + lsl x10, x10, #1 + adcs x2, x2, x10 + lsl x10, x10, #1 + adcs x3, x3, x10 + lsl x10, x10, #1 + adcs x4, x4, x10 + lsr x11, x10, #4 + adcs x5, x5, x11 + lsr x10, x10, #3 + adcs x6, x6, x10 + lsl x10, x10, #1 + adcs x7, x7, x10 + lsl x10, x10, #1 + and x10, x10, #0xFF + adc x8, x8, x10 + +// Because of the initial reduction the top bitfield (>= bits 520) is <= 1, +// i.e. just a single bit. Record that in "bf", then shift the whole +// scalar left 56 bits to align the top of the next bitfield with the MSB +// (bits 571..575). + + lsr bf, x8, #8 + extr x8, x8, x7, #8 + extr x7, x7, x6, #8 + extr x6, x6, x5, #8 + extr x5, x5, x4, #8 + extr x4, x4, x3, #8 + extr x3, x3, x2, #8 + extr x2, x2, x1, #8 + extr x1, x1, x0, #8 + lsl x0, x0, #56 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + +// According to the top bit, initialize the accumulator to P or 0. This top +// digit, uniquely, is not recoded so there is no sign adjustment to make. +// We only really need to adjust the z coordinate to zero, but do all three. + + add tabup, tab + cmp bf, xzr + + ldp x0, x1, [tabup] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc] + ldp x0, x1, [tabup, #16] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+16] + ldp x0, x1, [tabup, #32] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+32] + ldp x0, x1, [tabup, #48] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+48] + ldp x0, x1, [tabup, #64] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+64] + ldp x0, x1, [tabup, #80] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+80] + ldp x0, x1, [tabup, #96] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+96] + ldp x0, x1, [tabup, #112] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+112] + ldp x0, x1, [tabup, #128] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+128] + ldp x0, x1, [tabup, #144] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+144] + ldp x0, x1, [tabup, #160] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+160] + ldp x0, x1, [tabup, #176] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+176] + ldp x0, x1, [tabup, #192] + csel x0, x0, xzr, ne + csel x1, x1, xzr, ne + stp x0, x1, [acc+192] + ldr x0, [tabup, #208] + csel x0, x0, xzr, ne + str x0, [acc+208] + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. 
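+
+// An illustrative recoding sketch (hypothetical C, not part of this file):
+// after the bias above is added, every 5-bit window w stands for the signed
+// digit d = w - 16 in [-16,15], and subtracting 16 in each window exactly
+// cancels the bias again, so only the multiples 1 * P .. 16 * P are tabulated.
+// The helpers jdouble, jadd, select and negate_y are stand-ins for the
+// subroutines and the selectblock code used below.
+//
+//     // acc was initialised above from the lone top bit: P or the identity.
+//     for (int i = 103; i >= 0; i--) {         // 104 windows * 5 bits = 520 bits
+//         for (int k = 0; k < 5; k++) jdouble(acc, acc);
+//         int w = window5(scalar, i);          // current 5-bit window, 0..31
+//         int d = w - 16;                      // signed digit
+//         point q = select(tab, abs(d));       // constant time; abs(d) = 0 gives z = 0
+//         if (d < 0) negate_y(q);              // also done in constant time
+//         jadd(acc, acc, q);
+//     }
+//
+// The code below avoids the explicit window index by shifting the whole scalar
+// left five bits per iteration and always reading the top bitfield.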
+ + mov j, #520 + +p521_jscalarmul_alt_mainloop: + sub j, j, #5 + + add x0, acc + add x1, acc + bl p521_jscalarmul_alt_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_alt_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_alt_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_alt_jdouble + + add x0, acc + add x1, acc + bl p521_jscalarmul_alt_jdouble + +// Choose the bitfield and adjust it to sign and magnitude + + ldp x0, x1, [scalarb] + ldp x2, x3, [scalarb+16] + ldp x4, x5, [scalarb+32] + ldp x6, x7, [scalarb+48] + ldr x8, [scalarb+64] + lsr bf, x8, #59 + extr x8, x8, x7, #59 + extr x7, x7, x6, #59 + extr x6, x6, x5, #59 + extr x5, x5, x4, #59 + extr x4, x4, x3, #59 + extr x3, x3, x2, #59 + extr x2, x2, x1, #59 + extr x1, x1, x0, #59 + lsl x0, x0, #5 + stp x0, x1, [scalarb] + stp x2, x3, [scalarb+16] + stp x4, x5, [scalarb+32] + stp x6, x7, [scalarb+48] + str x8, [scalarb+64] + + subs bf, bf, #16 + csetm sgn, lo // sgn = sign of digit (1 = negative) + cneg bf, bf, lo // bf = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + str x8, [tabent+64] + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab+2*NUMSIZE + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + stp x0, x1, [tabent+2*NUMSIZE] + stp x2, x3, [tabent+2*NUMSIZE+16] + stp x4, x5, [tabent+2*NUMSIZE+32] + stp x6, x7, [tabent+2*NUMSIZE+48] + str x8, [tabent+2*NUMSIZE+64] + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + add tabup, tab+NUMSIZE + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + selectblock(9) + selectblock(10) + selectblock(11) + selectblock(12) + selectblock(13) + selectblock(14) + selectblock(15) + selectblock(16) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_521 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). 
+ + orr x10, x0, x1 + orr x11, x2, x3 + orr x12, x4, x5 + orr x13, x6, x7 + orr x10, x10, x11 + orr x12, x12, x13 + orr x12, x12, x8 + orr x10, x10, x12 + cmp x10, xzr + csel sgn, sgn, xzr, ne + + eor x0, x0, sgn + eor x1, x1, sgn + eor x2, x2, sgn + eor x3, x3, sgn + eor x4, x4, sgn + eor x5, x5, sgn + eor x6, x6, sgn + eor x7, x7, sgn + and sgn, sgn, #0x1FF + eor x8, x8, sgn + + stp x0, x1, [tabent+NUMSIZE] + stp x2, x3, [tabent+NUMSIZE+16] + stp x4, x5, [tabent+NUMSIZE+32] + stp x6, x7, [tabent+NUMSIZE+48] + str x8, [tabent+NUMSIZE+64] + +// Add to the accumulator + + add x0, acc + add x1, acc + add x2, tabent + bl p521_jscalarmul_alt_jadd + + cbnz j, p521_jscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + ldp x0, x1, [acc+96] + stp x0, x1, [res, #96] + ldp x0, x1, [acc+112] + stp x0, x1, [res, #112] + ldp x0, x1, [acc+128] + stp x0, x1, [res, #128] + ldp x0, x1, [acc+144] + stp x0, x1, [res, #144] + ldp x0, x1, [acc+160] + stp x0, x1, [res, #160] + ldp x0, x1, [acc+176] + stp x0, x1, [res, #176] + ldp x0, x1, [acc+192] + stp x0, x1, [res, #192] + ldr x0, [acc+208] + str x0, [res, #208] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment except +// that we share multiplication and squaring between the point operations. + +p521_jscalarmul_alt_bignum_mod_p521_9: + ldr x12, [x1, #64] + lsr x2, x12, #9 + cmp xzr, xzr + ldp x4, x5, [x1] + adcs xzr, x4, x2 + adcs xzr, x5, xzr + ldp x6, x7, [x1, #16] + and x3, x6, x7 + adcs xzr, x3, xzr + ldp x8, x9, [x1, #32] + and x3, x8, x9 + adcs xzr, x3, xzr + ldp x10, x11, [x1, #48] + and x3, x10, x11 + adcs xzr, x3, xzr + orr x3, x12, #0xfffffffffffffe00 + adcs x3, x3, xzr + adcs x4, x4, x2 + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adcs x11, x11, xzr + adc x12, x12, xzr + and x12, x12, #0x1ff + stp x4, x5, [x0] + stp x6, x7, [x0, #16] + stp x8, x9, [x0, #32] + stp x10, x11, [x0, #48] + str x12, [x0, #64] + ret + +p521_jscalarmul_alt_bignum_mod_n521_9: + ldr x14, [x1, #64] + lsr x15, x14, #9 + add x15, x15, #1 + mov x2, #39927 + movk x2, #28359, lsl #16 + movk x2, #18657, lsl #32 + movk x2, #17552, lsl #48 + mul x6, x2, x15 + mov x3, #47185 + movk x3, #30307, lsl #16 + movk x3, #13895, lsl #32 + movk x3, #50250, lsl #48 + mul x7, x3, x15 + mov x4, #23087 + movk x4, #2294, lsl #16 + movk x4, #65207, lsl #32 + movk x4, #32819, lsl #48 + mul x8, x4, x15 + mov x5, #27028 + movk x5, #16592, lsl #16 + movk x5, #30844, lsl #32 + movk x5, #44665, lsl #48 + mul x9, x5, x15 + lsl x10, x15, #2 + add x10, x10, x15 + umulh x13, x2, x15 + adds x7, x7, x13 + umulh x13, x3, x15 + adcs x8, x8, x13 + umulh x13, x4, x15 + adcs x9, x9, x13 + umulh x13, x5, x15 + adc x10, x10, x13 + ldp x12, x13, [x1] + adds x6, x6, x12 + adcs x7, x7, x13 + ldp x12, x13, [x1, #16] + adcs x8, x8, x12 + adcs x9, x9, x13 + ldp x13, x11, [x1, #32] + adcs x10, x10, x13 + adcs x11, x11, xzr + ldp x12, x13, [x1, #48] + adcs x12, x12, xzr + adcs x13, x13, xzr + orr x14, x14, #0xfffffffffffffe00 + adcs x14, x14, xzr + csetm x15, lo + and 
x2, x2, x15 + subs x6, x6, x2 + and x3, x3, x15 + sbcs x7, x7, x3 + and x4, x4, x15 + sbcs x8, x8, x4 + and x5, x5, x15 + sbcs x9, x9, x5 + mov x2, #5 + and x2, x2, x15 + sbcs x10, x10, x2 + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + sbc x14, x14, xzr + and x14, x14, #0x1ff + stp x6, x7, [x0] + stp x8, x9, [x0, #16] + stp x10, x11, [x0, #32] + stp x12, x13, [x0, #48] + str x14, [x0, #64] + ret + +p521_jscalarmul_alt_jadd: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! + sub sp, sp, #0x240 + mov x27, x0 + mov x28, x1 + mov x29, x2 + mov x0, sp + add x1, x28, #0x90 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0x168 + add x1, x29, #0x90 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0x1f8 + add x1, x29, #0x90 + add x2, x28, #0x48 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x48 + add x1, x28, #0x90 + add x2, x29, #0x48 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x90 + mov x1, sp + add x2, x29, #0x0 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x168 + add x2, x28, #0x0 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x48 + mov x1, sp + add x2, sp, #0x48 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x1f8 + add x1, sp, #0x168 + add x2, sp, #0x1f8 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0x90 + add x2, sp, #0x120 + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0x48 + add x1, sp, #0x48 + add x2, sp, #0x1f8 + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0x168 + bl p521_jscalarmul_alt_sqr_p521 + mov x0, sp + add x1, sp, #0x48 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0x120 + add x1, sp, #0xd8 + add x2, sp, #0x120 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x90 + add x1, sp, #0xd8 + add x2, sp, #0x90 + bl p521_jscalarmul_alt_mul_p521 + mov x0, sp + mov x1, sp + add x2, sp, #0x120 + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0x90 + add x2, sp, #0x120 + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0x168 + add x1, sp, #0x168 + add x2, x28, #0x90 + bl p521_jscalarmul_alt_mul_p521 + mov x0, sp + mov x1, sp + add x2, sp, #0x90 + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0x120 + add x1, sp, #0x120 + mov x2, sp + bl p521_jscalarmul_alt_sub_p521 + add x0, sp, #0xd8 + add x1, sp, #0xd8 + add x2, sp, #0x1f8 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0x168 + add x2, x29, #0x90 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x48 + add x2, sp, #0x120 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x120 + add x1, sp, #0x120 + add x2, sp, #0xd8 + bl p521_jscalarmul_alt_sub_p521 + ldp x0, x1, [x28, #144] + ldp x2, x3, [x28, #160] + ldp x4, x5, [x28, #176] + ldp x6, x7, [x28, #192] + ldr x8, [x28, #208] + orr x20, x0, x1 + orr x21, x2, x3 + orr x22, x4, x5 + orr x23, x6, x7 + orr x20, x20, x21 + orr x22, x22, x23 + orr x20, x20, x8 + orr x20, x20, x22 + cmp x20, xzr + cset x20, ne + ldp x10, x11, [x29, #144] + ldp x12, x13, [x29, #160] + ldp x14, x15, [x29, #176] + ldp x16, x17, [x29, #192] + ldr x19, [x29, #208] + orr x21, x10, x11 + orr x22, x12, x13 + orr x23, x14, x15 + orr x24, x16, x17 + orr x21, x21, x22 + orr x23, x23, x24 + orr x21, x21, x19 + orr x21, x21, x23 + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + cmp x21, xzr 
+ cset x21, ne + cmp x21, x20 + ldp x10, x11, [sp, #360] + ldp x12, x13, [sp, #376] + ldp x14, x15, [sp, #392] + ldp x16, x17, [sp, #408] + ldr x19, [sp, #424] + csel x0, x0, x10, ne + csel x1, x1, x11, ne + csel x2, x2, x12, ne + csel x3, x3, x13, ne + csel x4, x4, x14, ne + csel x5, x5, x15, ne + csel x6, x6, x16, ne + csel x7, x7, x17, ne + csel x8, x8, x19, ne + stp x0, x1, [sp, #360] + stp x2, x3, [sp, #376] + stp x4, x5, [sp, #392] + stp x6, x7, [sp, #408] + str x8, [sp, #424] + ldp x20, x21, [x28] + ldp x0, x1, [sp] + csel x0, x20, x0, cc + csel x1, x21, x1, cc + ldp x20, x21, [x29] + csel x0, x20, x0, hi + csel x1, x21, x1, hi + ldp x20, x21, [x28, #16] + ldp x2, x3, [sp, #16] + csel x2, x20, x2, cc + csel x3, x21, x3, cc + ldp x20, x21, [x29, #16] + csel x2, x20, x2, hi + csel x3, x21, x3, hi + ldp x20, x21, [x28, #32] + ldp x4, x5, [sp, #32] + csel x4, x20, x4, cc + csel x5, x21, x5, cc + ldp x20, x21, [x29, #32] + csel x4, x20, x4, hi + csel x5, x21, x5, hi + ldp x20, x21, [x28, #48] + ldp x6, x7, [sp, #48] + csel x6, x20, x6, cc + csel x7, x21, x7, cc + ldp x20, x21, [x29, #48] + csel x6, x20, x6, hi + csel x7, x21, x7, hi + ldr x20, [x28, #64] + ldr x8, [sp, #64] + csel x8, x20, x8, cc + ldr x21, [x29, #64] + csel x8, x21, x8, hi + ldp x20, x21, [x28, #72] + ldp x10, x11, [sp, #288] + csel x10, x20, x10, cc + csel x11, x21, x11, cc + ldp x20, x21, [x29, #72] + csel x10, x20, x10, hi + csel x11, x21, x11, hi + ldp x20, x21, [x28, #88] + ldp x12, x13, [sp, #304] + csel x12, x20, x12, cc + csel x13, x21, x13, cc + ldp x20, x21, [x29, #88] + csel x12, x20, x12, hi + csel x13, x21, x13, hi + ldp x20, x21, [x28, #104] + ldp x14, x15, [sp, #320] + csel x14, x20, x14, cc + csel x15, x21, x15, cc + ldp x20, x21, [x29, #104] + csel x14, x20, x14, hi + csel x15, x21, x15, hi + ldp x20, x21, [x28, #120] + ldp x16, x17, [sp, #336] + csel x16, x20, x16, cc + csel x17, x21, x17, cc + ldp x20, x21, [x29, #120] + csel x16, x20, x16, hi + csel x17, x21, x17, hi + ldr x20, [x28, #136] + ldr x19, [sp, #352] + csel x19, x20, x19, cc + ldr x21, [x29, #136] + csel x19, x21, x19, hi + stp x0, x1, [x27] + stp x2, x3, [x27, #16] + stp x4, x5, [x27, #32] + stp x6, x7, [x27, #48] + str x8, [x27, #64] + ldp x0, x1, [sp, #360] + ldp x2, x3, [sp, #376] + ldp x4, x5, [sp, #392] + ldp x6, x7, [sp, #408] + ldr x8, [sp, #424] + stp x10, x11, [x27, #72] + stp x12, x13, [x27, #88] + stp x14, x15, [x27, #104] + stp x16, x17, [x27, #120] + str x19, [x27, #136] + stp x0, x1, [x27, #144] + stp x2, x3, [x27, #160] + stp x4, x5, [x27, #176] + stp x6, x7, [x27, #192] + str x8, [x27, #208] + add sp, sp, #0x240 + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_alt_jdouble: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x28, [sp, #-16]! + stp x29, x30, [sp, #-16]! 
+ sub sp, sp, #0x200 + mov x27, x0 + mov x28, x1 + mov x0, sp + add x1, x28, #0x90 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0x48 + add x1, x28, #0x48 + bl p521_jscalarmul_alt_sqr_p521 + ldp x5, x6, [x28] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x28, #16] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x28, #32] + ldp x4, x3, [sp, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x28, #48] + ldp x4, x3, [sp, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x28, #64] + ldr x4, [sp, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [sp, #216] + stp x7, x8, [sp, #232] + stp x9, x10, [sp, #248] + stp x11, x12, [sp, #264] + str x13, [sp, #280] + cmp xzr, xzr + ldp x5, x6, [x28] + ldp x4, x3, [sp] + adcs x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x28, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x28, #32] + ldp x4, x3, [sp, #32] + adcs x9, x9, x4 + adcs x10, x10, x3 + ldp x11, x12, [x28, #48] + ldp x4, x3, [sp, #48] + adcs x11, x11, x4 + adcs x12, x12, x3 + ldr x13, [x28, #64] + ldr x4, [sp, #64] + adc x13, x13, x4 + subs x4, x13, #0x200 + csetm x4, cs + sbcs x5, x5, xzr + and x4, x4, #0x200 + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbc x13, x13, x4 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + add x0, sp, #0xd8 + add x1, sp, #0x90 + add x2, sp, #0xd8 + bl p521_jscalarmul_alt_mul_p521 + cmp xzr, xzr + ldp x5, x6, [x28, #72] + ldp x4, x3, [x28, #144] + adcs x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x28, #88] + ldp x4, x3, [x28, #160] + adcs x7, x7, x4 + adcs x8, x8, x3 + ldp x9, x10, [x28, #104] + ldp x4, x3, [x28, #176] + adcs x9, x9, x4 + adcs x10, x10, x3 + ldp x11, x12, [x28, #120] + ldp x4, x3, [x28, #192] + adcs x11, x11, x4 + adcs x12, x12, x3 + ldr x13, [x28, #136] + ldr x4, [x28, #208] + adc x13, x13, x4 + subs x4, x13, #0x200 + csetm x4, cs + sbcs x5, x5, xzr + and x4, x4, #0x200 + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbc x13, x13, x4 + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + add x0, sp, #0x120 + add x1, x28, #0x0 + add x2, sp, #0x48 + bl p521_jscalarmul_alt_mul_p521 + add x0, sp, #0x168 + add x1, sp, #0xd8 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0x90 + add x1, sp, #0x90 + bl p521_jscalarmul_alt_sqr_p521 + ldp x6, x7, [sp, #288] + mov x1, #0xc + mul x3, x1, x6 + mul x4, x1, x7 + umulh x6, x1, x6 + adds x4, x4, x6 + umulh x7, x1, x7 + ldp x8, x9, [sp, #304] + mul x5, x1, x8 + mul x6, x1, x9 + umulh x8, x1, x8 + adcs x5, x5, x7 + umulh x9, x1, x9 + adcs x6, x6, x8 + ldp x10, x11, [sp, #320] + mul x7, x1, x10 + mul x8, x1, x11 + umulh x10, x1, x10 + adcs x7, x7, x9 + umulh x11, x1, x11 + adcs x8, x8, x10 + ldp x12, x13, [sp, #336] + mul x9, x1, x12 + mul x10, x1, x13 + umulh x12, x1, x12 + adcs x9, x9, x11 + umulh x13, x1, x13 + adcs x10, x10, x12 + ldr x14, [sp, #352] + mul x11, x1, x14 + adc x11, x11, x13 + mov x1, #0x9 + ldp x20, x21, [sp, #360] + mvn x20, x20 + mul x0, x1, x20 + umulh x20, x1, x20 + adds x3, 
x3, x0 + mvn x21, x21 + mul x0, x1, x21 + umulh x21, x1, x21 + adcs x4, x4, x0 + ldp x22, x23, [sp, #376] + mvn x22, x22 + mul x0, x1, x22 + umulh x22, x1, x22 + adcs x5, x5, x0 + mvn x23, x23 + mul x0, x1, x23 + umulh x23, x1, x23 + adcs x6, x6, x0 + ldp x17, x19, [sp, #392] + mvn x17, x17 + mul x0, x1, x17 + umulh x17, x1, x17 + adcs x7, x7, x0 + mvn x19, x19 + mul x0, x1, x19 + umulh x19, x1, x19 + adcs x8, x8, x0 + ldp x2, x16, [sp, #408] + mvn x2, x2 + mul x0, x1, x2 + umulh x2, x1, x2 + adcs x9, x9, x0 + mvn x16, x16 + mul x0, x1, x16 + umulh x16, x1, x16 + adcs x10, x10, x0 + ldr x0, [sp, #424] + eor x0, x0, #0x1ff + mul x0, x1, x0 + adc x11, x11, x0 + adds x4, x4, x20 + adcs x5, x5, x21 + and x15, x4, x5 + adcs x6, x6, x22 + and x15, x15, x6 + adcs x7, x7, x23 + and x15, x15, x7 + adcs x8, x8, x17 + and x15, x15, x8 + adcs x9, x9, x19 + and x15, x15, x9 + adcs x10, x10, x2 + and x15, x15, x10 + adc x11, x11, x16 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [sp, #360] + stp x5, x6, [sp, #376] + stp x7, x8, [sp, #392] + stp x9, x10, [sp, #408] + str x11, [sp, #424] + ldp x5, x6, [sp, #144] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #160] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #176] + ldp x4, x3, [sp, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [sp, #192] + ldp x4, x3, [sp, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [sp, #208] + ldr x4, [sp, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [sp, #144] + stp x7, x8, [sp, #160] + stp x9, x10, [sp, #176] + stp x11, x12, [sp, #192] + str x13, [sp, #208] + mov x0, sp + add x1, sp, #0x48 + bl p521_jscalarmul_alt_sqr_p521 + add x0, sp, #0xd8 + add x1, sp, #0x168 + add x2, sp, #0xd8 + bl p521_jscalarmul_alt_mul_p521 + ldp x5, x6, [sp, #144] + ldp x4, x3, [sp, #72] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #160] + ldp x4, x3, [sp, #88] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [sp, #176] + ldp x4, x3, [sp, #104] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [sp, #192] + ldp x4, x3, [sp, #120] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [sp, #208] + ldr x4, [sp, #136] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x27, #144] + stp x7, x8, [x27, #160] + stp x9, x10, [x27, #176] + stp x11, x12, [x27, #192] + str x13, [x27, #208] + ldp x6, x7, [sp, #288] + lsl x3, x6, #2 + extr x4, x7, x6, #62 + ldp x8, x9, [sp, #304] + extr x5, x8, x7, #62 + extr x6, x9, x8, #62 + ldp x10, x11, [sp, #320] + extr x7, x10, x9, #62 + extr x8, x11, x10, #62 + ldp x12, x13, [sp, #336] + extr x9, x12, x11, #62 + extr x10, x13, x12, #62 + ldr x14, [sp, #352] + extr x11, x14, x13, #62 + ldp x0, x1, [sp, #360] + mvn x0, x0 + adds x3, x3, x0 + sbcs x4, x4, x1 + ldp x0, x1, [sp, #376] + sbcs x5, x5, x0 + and x15, x4, x5 + sbcs x6, x6, x1 + and x15, x15, x6 + ldp x0, x1, 
[sp, #392] + sbcs x7, x7, x0 + and x15, x15, x7 + sbcs x8, x8, x1 + and x15, x15, x8 + ldp x0, x1, [sp, #408] + sbcs x9, x9, x0 + and x15, x15, x9 + sbcs x10, x10, x1 + and x15, x15, x10 + ldr x0, [sp, #424] + eor x0, x0, #0x1ff + adc x11, x11, x0 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [x27] + stp x5, x6, [x27, #16] + stp x7, x8, [x27, #32] + stp x9, x10, [x27, #48] + str x11, [x27, #64] + ldp x6, x7, [sp, #216] + lsl x3, x6, #1 + adds x3, x3, x6 + extr x4, x7, x6, #63 + adcs x4, x4, x7 + ldp x8, x9, [sp, #232] + extr x5, x8, x7, #63 + adcs x5, x5, x8 + extr x6, x9, x8, #63 + adcs x6, x6, x9 + ldp x10, x11, [sp, #248] + extr x7, x10, x9, #63 + adcs x7, x7, x10 + extr x8, x11, x10, #63 + adcs x8, x8, x11 + ldp x12, x13, [sp, #264] + extr x9, x12, x11, #63 + adcs x9, x9, x12 + extr x10, x13, x12, #63 + adcs x10, x10, x13 + ldr x14, [sp, #280] + extr x11, x14, x13, #63 + adc x11, x11, x14 + ldp x20, x21, [sp] + mvn x20, x20 + lsl x0, x20, #3 + adds x3, x3, x0 + mvn x21, x21 + extr x0, x21, x20, #61 + adcs x4, x4, x0 + ldp x22, x23, [sp, #16] + mvn x22, x22 + extr x0, x22, x21, #61 + adcs x5, x5, x0 + and x15, x4, x5 + mvn x23, x23 + extr x0, x23, x22, #61 + adcs x6, x6, x0 + and x15, x15, x6 + ldp x20, x21, [sp, #32] + mvn x20, x20 + extr x0, x20, x23, #61 + adcs x7, x7, x0 + and x15, x15, x7 + mvn x21, x21 + extr x0, x21, x20, #61 + adcs x8, x8, x0 + and x15, x15, x8 + ldp x22, x23, [sp, #48] + mvn x22, x22 + extr x0, x22, x21, #61 + adcs x9, x9, x0 + and x15, x15, x9 + mvn x23, x23 + extr x0, x23, x22, #61 + adcs x10, x10, x0 + and x15, x15, x10 + ldr x0, [sp, #64] + eor x0, x0, #0x1ff + extr x0, x0, x23, #61 + adc x11, x11, x0 + lsr x12, x11, #9 + orr x11, x11, #0xfffffffffffffe00 + cmp xzr, xzr + adcs xzr, x3, x12 + adcs xzr, x15, xzr + adcs xzr, x11, xzr + adcs x3, x3, x12 + adcs x4, x4, xzr + adcs x5, x5, xzr + adcs x6, x6, xzr + adcs x7, x7, xzr + adcs x8, x8, xzr + adcs x9, x9, xzr + adcs x10, x10, xzr + adc x11, x11, xzr + and x11, x11, #0x1ff + stp x3, x4, [x27, #72] + stp x5, x6, [x27, #88] + stp x7, x8, [x27, #104] + stp x9, x10, [x27, #120] + str x11, [x27, #136] + add sp, sp, #0x200 + ldp x29, x30, [sp], #16 + ldp x27, x28, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +p521_jscalarmul_alt_mul_p521: + ldp x3, x4, [x1] + ldp x5, x6, [x2] + mul x15, x3, x5 + umulh x16, x3, x5 + mul x14, x3, x6 + umulh x17, x3, x6 + adds x16, x16, x14 + ldp x7, x8, [x2, #16] + mul x14, x3, x7 + umulh x19, x3, x7 + adcs x17, x17, x14 + mul x14, x3, x8 + umulh x20, x3, x8 + adcs x19, x19, x14 + ldp x9, x10, [x2, #32] + mul x14, x3, x9 + umulh x21, x3, x9 + adcs x20, x20, x14 + mul x14, x3, x10 + umulh x22, x3, x10 + adcs x21, x21, x14 + ldp x11, x12, [x2, #48] + mul x14, x3, x11 + umulh x23, x3, x11 + adcs x22, x22, x14 + ldr x13, [x2, #64] + mul x14, x3, x12 + umulh x24, x3, x12 + adcs x23, x23, x14 + mul x14, x3, x13 + umulh x25, x3, x13 + adcs x24, x24, x14 + adc x25, x25, xzr + mul x14, x4, x5 + adds x16, x16, x14 + mul x14, x4, x6 + adcs x17, x17, x14 + mul x14, x4, x7 + adcs x19, x19, x14 + mul x14, x4, x8 + adcs x20, x20, x14 + mul x14, x4, x9 + adcs x21, x21, x14 + mul x14, x4, x10 + adcs x22, x22, x14 + mul x14, x4, 
x11 + adcs x23, x23, x14 + mul x14, x4, x12 + adcs x24, x24, x14 + mul x14, x4, x13 + adcs x25, x25, x14 + cset x26, cs + umulh x14, x4, x5 + adds x17, x17, x14 + umulh x14, x4, x6 + adcs x19, x19, x14 + umulh x14, x4, x7 + adcs x20, x20, x14 + umulh x14, x4, x8 + adcs x21, x21, x14 + umulh x14, x4, x9 + adcs x22, x22, x14 + umulh x14, x4, x10 + adcs x23, x23, x14 + umulh x14, x4, x11 + adcs x24, x24, x14 + umulh x14, x4, x12 + adcs x25, x25, x14 + umulh x14, x4, x13 + adc x26, x26, x14 + stp x15, x16, [sp, #432] + ldp x3, x4, [x1, #16] + mul x14, x3, x5 + adds x17, x17, x14 + mul x14, x3, x6 + adcs x19, x19, x14 + mul x14, x3, x7 + adcs x20, x20, x14 + mul x14, x3, x8 + adcs x21, x21, x14 + mul x14, x3, x9 + adcs x22, x22, x14 + mul x14, x3, x10 + adcs x23, x23, x14 + mul x14, x3, x11 + adcs x24, x24, x14 + mul x14, x3, x12 + adcs x25, x25, x14 + mul x14, x3, x13 + adcs x26, x26, x14 + cset x15, cs + umulh x14, x3, x5 + adds x19, x19, x14 + umulh x14, x3, x6 + adcs x20, x20, x14 + umulh x14, x3, x7 + adcs x21, x21, x14 + umulh x14, x3, x8 + adcs x22, x22, x14 + umulh x14, x3, x9 + adcs x23, x23, x14 + umulh x14, x3, x10 + adcs x24, x24, x14 + umulh x14, x3, x11 + adcs x25, x25, x14 + umulh x14, x3, x12 + adcs x26, x26, x14 + umulh x14, x3, x13 + adc x15, x15, x14 + mul x14, x4, x5 + adds x19, x19, x14 + mul x14, x4, x6 + adcs x20, x20, x14 + mul x14, x4, x7 + adcs x21, x21, x14 + mul x14, x4, x8 + adcs x22, x22, x14 + mul x14, x4, x9 + adcs x23, x23, x14 + mul x14, x4, x10 + adcs x24, x24, x14 + mul x14, x4, x11 + adcs x25, x25, x14 + mul x14, x4, x12 + adcs x26, x26, x14 + mul x14, x4, x13 + adcs x15, x15, x14 + cset x16, cs + umulh x14, x4, x5 + adds x20, x20, x14 + umulh x14, x4, x6 + adcs x21, x21, x14 + umulh x14, x4, x7 + adcs x22, x22, x14 + umulh x14, x4, x8 + adcs x23, x23, x14 + umulh x14, x4, x9 + adcs x24, x24, x14 + umulh x14, x4, x10 + adcs x25, x25, x14 + umulh x14, x4, x11 + adcs x26, x26, x14 + umulh x14, x4, x12 + adcs x15, x15, x14 + umulh x14, x4, x13 + adc x16, x16, x14 + stp x17, x19, [sp, #448] + ldp x3, x4, [x1, #32] + mul x14, x3, x5 + adds x20, x20, x14 + mul x14, x3, x6 + adcs x21, x21, x14 + mul x14, x3, x7 + adcs x22, x22, x14 + mul x14, x3, x8 + adcs x23, x23, x14 + mul x14, x3, x9 + adcs x24, x24, x14 + mul x14, x3, x10 + adcs x25, x25, x14 + mul x14, x3, x11 + adcs x26, x26, x14 + mul x14, x3, x12 + adcs x15, x15, x14 + mul x14, x3, x13 + adcs x16, x16, x14 + cset x17, cs + umulh x14, x3, x5 + adds x21, x21, x14 + umulh x14, x3, x6 + adcs x22, x22, x14 + umulh x14, x3, x7 + adcs x23, x23, x14 + umulh x14, x3, x8 + adcs x24, x24, x14 + umulh x14, x3, x9 + adcs x25, x25, x14 + umulh x14, x3, x10 + adcs x26, x26, x14 + umulh x14, x3, x11 + adcs x15, x15, x14 + umulh x14, x3, x12 + adcs x16, x16, x14 + umulh x14, x3, x13 + adc x17, x17, x14 + mul x14, x4, x5 + adds x21, x21, x14 + mul x14, x4, x6 + adcs x22, x22, x14 + mul x14, x4, x7 + adcs x23, x23, x14 + mul x14, x4, x8 + adcs x24, x24, x14 + mul x14, x4, x9 + adcs x25, x25, x14 + mul x14, x4, x10 + adcs x26, x26, x14 + mul x14, x4, x11 + adcs x15, x15, x14 + mul x14, x4, x12 + adcs x16, x16, x14 + mul x14, x4, x13 + adcs x17, x17, x14 + cset x19, cs + umulh x14, x4, x5 + adds x22, x22, x14 + umulh x14, x4, x6 + adcs x23, x23, x14 + umulh x14, x4, x7 + adcs x24, x24, x14 + umulh x14, x4, x8 + adcs x25, x25, x14 + umulh x14, x4, x9 + adcs x26, x26, x14 + umulh x14, x4, x10 + adcs x15, x15, x14 + umulh x14, x4, x11 + adcs x16, x16, x14 + umulh x14, x4, x12 + adcs x17, x17, x14 + umulh x14, x4, x13 + adc x19, 
x19, x14 + stp x20, x21, [sp, #464] + ldp x3, x4, [x1, #48] + mul x14, x3, x5 + adds x22, x22, x14 + mul x14, x3, x6 + adcs x23, x23, x14 + mul x14, x3, x7 + adcs x24, x24, x14 + mul x14, x3, x8 + adcs x25, x25, x14 + mul x14, x3, x9 + adcs x26, x26, x14 + mul x14, x3, x10 + adcs x15, x15, x14 + mul x14, x3, x11 + adcs x16, x16, x14 + mul x14, x3, x12 + adcs x17, x17, x14 + mul x14, x3, x13 + adcs x19, x19, x14 + cset x20, cs + umulh x14, x3, x5 + adds x23, x23, x14 + umulh x14, x3, x6 + adcs x24, x24, x14 + umulh x14, x3, x7 + adcs x25, x25, x14 + umulh x14, x3, x8 + adcs x26, x26, x14 + umulh x14, x3, x9 + adcs x15, x15, x14 + umulh x14, x3, x10 + adcs x16, x16, x14 + umulh x14, x3, x11 + adcs x17, x17, x14 + umulh x14, x3, x12 + adcs x19, x19, x14 + umulh x14, x3, x13 + adc x20, x20, x14 + mul x14, x4, x5 + adds x23, x23, x14 + mul x14, x4, x6 + adcs x24, x24, x14 + mul x14, x4, x7 + adcs x25, x25, x14 + mul x14, x4, x8 + adcs x26, x26, x14 + mul x14, x4, x9 + adcs x15, x15, x14 + mul x14, x4, x10 + adcs x16, x16, x14 + mul x14, x4, x11 + adcs x17, x17, x14 + mul x14, x4, x12 + adcs x19, x19, x14 + mul x14, x4, x13 + adcs x20, x20, x14 + cset x21, cs + umulh x14, x4, x5 + adds x24, x24, x14 + umulh x14, x4, x6 + adcs x25, x25, x14 + umulh x14, x4, x7 + adcs x26, x26, x14 + umulh x14, x4, x8 + adcs x15, x15, x14 + umulh x14, x4, x9 + adcs x16, x16, x14 + umulh x14, x4, x10 + adcs x17, x17, x14 + umulh x14, x4, x11 + adcs x19, x19, x14 + umulh x14, x4, x12 + adcs x20, x20, x14 + umulh x14, x4, x13 + adc x21, x21, x14 + stp x22, x23, [sp, #480] + ldr x3, [x1, #64] + mul x14, x3, x5 + adds x24, x24, x14 + mul x14, x3, x6 + adcs x25, x25, x14 + mul x14, x3, x7 + adcs x26, x26, x14 + mul x14, x3, x8 + adcs x15, x15, x14 + mul x14, x3, x9 + adcs x16, x16, x14 + mul x14, x3, x10 + adcs x17, x17, x14 + mul x14, x3, x11 + adcs x19, x19, x14 + mul x14, x3, x12 + adcs x20, x20, x14 + mul x14, x3, x13 + adc x21, x21, x14 + umulh x14, x3, x5 + adds x25, x25, x14 + umulh x14, x3, x6 + adcs x26, x26, x14 + umulh x14, x3, x7 + adcs x15, x15, x14 + umulh x14, x3, x8 + adcs x16, x16, x14 + umulh x14, x3, x9 + adcs x17, x17, x14 + umulh x14, x3, x10 + adcs x19, x19, x14 + umulh x14, x3, x11 + adcs x20, x20, x14 + umulh x14, x3, x12 + adc x21, x21, x14 + cmp xzr, xzr + ldp x5, x6, [sp, #432] + extr x14, x25, x24, #9 + adcs x5, x5, x14 + extr x14, x26, x25, #9 + adcs x6, x6, x14 + ldp x7, x8, [sp, #448] + extr x14, x15, x26, #9 + adcs x7, x7, x14 + extr x14, x16, x15, #9 + adcs x8, x8, x14 + ldp x9, x10, [sp, #464] + extr x14, x17, x16, #9 + adcs x9, x9, x14 + extr x14, x19, x17, #9 + adcs x10, x10, x14 + ldp x11, x12, [sp, #480] + extr x14, x20, x19, #9 + adcs x11, x11, x14 + extr x14, x21, x20, #9 + adcs x12, x12, x14 + orr x13, x24, #0xfffffffffffffe00 + lsr x14, x21, #9 + adcs x13, x13, x14 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbc x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] + ret + +p521_jscalarmul_alt_sqr_p521: + ldp x2, x3, [x1] + mul x11, x2, x3 + umulh x12, x2, x3 + ldp x4, x5, [x1, #16] + mul x10, x2, x4 + umulh x13, x2, x4 + adds x12, x12, x10 + ldp x6, x7, [x1, #32] + mul x10, x2, x5 + umulh x14, x2, x5 + adcs x13, x13, x10 + ldp x8, x9, [x1, #48] + mul x10, x2, x6 + umulh x15, x2, x6 + adcs x14, x14, x10 + mul x10, x2, x7 + umulh x16, x2, x7 + adcs x15, x15, x10 + mul 
x10, x2, x8 + umulh x17, x2, x8 + adcs x16, x16, x10 + mul x10, x2, x9 + umulh x19, x2, x9 + adcs x17, x17, x10 + adc x19, x19, xzr + mul x10, x3, x4 + adds x13, x13, x10 + mul x10, x3, x5 + adcs x14, x14, x10 + mul x10, x3, x6 + adcs x15, x15, x10 + mul x10, x3, x7 + adcs x16, x16, x10 + mul x10, x3, x8 + adcs x17, x17, x10 + mul x10, x3, x9 + adcs x19, x19, x10 + cset x20, cs + umulh x10, x3, x4 + adds x14, x14, x10 + umulh x10, x3, x5 + adcs x15, x15, x10 + umulh x10, x3, x6 + adcs x16, x16, x10 + umulh x10, x3, x7 + adcs x17, x17, x10 + umulh x10, x3, x8 + adcs x19, x19, x10 + umulh x10, x3, x9 + adc x20, x20, x10 + mul x10, x6, x7 + umulh x21, x6, x7 + adds x20, x20, x10 + adc x21, x21, xzr + mul x10, x4, x5 + adds x15, x15, x10 + mul x10, x4, x6 + adcs x16, x16, x10 + mul x10, x4, x7 + adcs x17, x17, x10 + mul x10, x4, x8 + adcs x19, x19, x10 + mul x10, x4, x9 + adcs x20, x20, x10 + mul x10, x6, x8 + adcs x21, x21, x10 + cset x22, cs + umulh x10, x4, x5 + adds x16, x16, x10 + umulh x10, x4, x6 + adcs x17, x17, x10 + umulh x10, x4, x7 + adcs x19, x19, x10 + umulh x10, x4, x8 + adcs x20, x20, x10 + umulh x10, x4, x9 + adcs x21, x21, x10 + umulh x10, x6, x8 + adc x22, x22, x10 + mul x10, x7, x8 + umulh x23, x7, x8 + adds x22, x22, x10 + adc x23, x23, xzr + mul x10, x5, x6 + adds x17, x17, x10 + mul x10, x5, x7 + adcs x19, x19, x10 + mul x10, x5, x8 + adcs x20, x20, x10 + mul x10, x5, x9 + adcs x21, x21, x10 + mul x10, x6, x9 + adcs x22, x22, x10 + mul x10, x7, x9 + adcs x23, x23, x10 + cset x24, cs + umulh x10, x5, x6 + adds x19, x19, x10 + umulh x10, x5, x7 + adcs x20, x20, x10 + umulh x10, x5, x8 + adcs x21, x21, x10 + umulh x10, x5, x9 + adcs x22, x22, x10 + umulh x10, x6, x9 + adcs x23, x23, x10 + umulh x10, x7, x9 + adc x24, x24, x10 + mul x10, x8, x9 + umulh x25, x8, x9 + adds x24, x24, x10 + adc x25, x25, xzr + adds x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + adcs x15, x15, x15 + adcs x16, x16, x16 + adcs x17, x17, x17 + adcs x19, x19, x19 + adcs x20, x20, x20 + adcs x21, x21, x21 + adcs x22, x22, x22 + adcs x23, x23, x23 + adcs x24, x24, x24 + adcs x25, x25, x25 + cset x26, cs + umulh x10, x2, x2 + adds x11, x11, x10 + mul x10, x3, x3 + adcs x12, x12, x10 + umulh x10, x3, x3 + adcs x13, x13, x10 + mul x10, x4, x4 + adcs x14, x14, x10 + umulh x10, x4, x4 + adcs x15, x15, x10 + mul x10, x5, x5 + adcs x16, x16, x10 + umulh x10, x5, x5 + adcs x17, x17, x10 + mul x10, x6, x6 + adcs x19, x19, x10 + umulh x10, x6, x6 + adcs x20, x20, x10 + mul x10, x7, x7 + adcs x21, x21, x10 + umulh x10, x7, x7 + adcs x22, x22, x10 + mul x10, x8, x8 + adcs x23, x23, x10 + umulh x10, x8, x8 + adcs x24, x24, x10 + mul x10, x9, x9 + adcs x25, x25, x10 + umulh x10, x9, x9 + adc x26, x26, x10 + ldr x1, [x1, #64] + add x1, x1, x1 + mul x10, x1, x2 + adds x19, x19, x10 + umulh x10, x1, x2 + adcs x20, x20, x10 + mul x10, x1, x4 + adcs x21, x21, x10 + umulh x10, x1, x4 + adcs x22, x22, x10 + mul x10, x1, x6 + adcs x23, x23, x10 + umulh x10, x1, x6 + adcs x24, x24, x10 + mul x10, x1, x8 + adcs x25, x25, x10 + umulh x10, x1, x8 + adcs x26, x26, x10 + lsr x4, x1, #1 + mul x4, x4, x4 + adc x4, x4, xzr + mul x10, x1, x3 + adds x20, x20, x10 + umulh x10, x1, x3 + adcs x21, x21, x10 + mul x10, x1, x5 + adcs x22, x22, x10 + umulh x10, x1, x5 + adcs x23, x23, x10 + mul x10, x1, x7 + adcs x24, x24, x10 + umulh x10, x1, x7 + adcs x25, x25, x10 + mul x10, x1, x9 + adcs x26, x26, x10 + umulh x10, x1, x9 + adc x4, x4, x10 + mul x2, x2, x2 + cmp xzr, xzr + extr x10, x20, x19, #9 + adcs x2, 
x2, x10 + extr x10, x21, x20, #9 + adcs x11, x11, x10 + extr x10, x22, x21, #9 + adcs x12, x12, x10 + extr x10, x23, x22, #9 + adcs x13, x13, x10 + extr x10, x24, x23, #9 + adcs x14, x14, x10 + extr x10, x25, x24, #9 + adcs x15, x15, x10 + extr x10, x26, x25, #9 + adcs x16, x16, x10 + extr x10, x4, x26, #9 + adcs x17, x17, x10 + orr x19, x19, #0xfffffffffffffe00 + lsr x10, x4, #9 + adcs x19, x19, x10 + sbcs x2, x2, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + sbcs x14, x14, xzr + sbcs x15, x15, xzr + sbcs x16, x16, xzr + sbcs x17, x17, xzr + sbc x19, x19, xzr + and x19, x19, #0x1ff + stp x2, x11, [x0] + stp x12, x13, [x0, #16] + stp x14, x15, [x0, #32] + stp x16, x17, [x0, #48] + str x19, [x0, #64] + ret + +p521_jscalarmul_alt_sub_p521: + ldp x5, x6, [x1] + ldp x4, x3, [x2] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [x2, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + ldp x9, x10, [x1, #32] + ldp x4, x3, [x2, #32] + sbcs x9, x9, x4 + sbcs x10, x10, x3 + ldp x11, x12, [x1, #48] + ldp x4, x3, [x2, #48] + sbcs x11, x11, x4 + sbcs x12, x12, x3 + ldr x13, [x1, #64] + ldr x4, [x2, #64] + sbcs x13, x13, x4 + sbcs x5, x5, xzr + sbcs x6, x6, xzr + sbcs x7, x7, xzr + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbcs x10, x10, xzr + sbcs x11, x11, xzr + sbcs x12, x12, xzr + sbcs x13, x13, xzr + and x13, x13, #0x1ff + stp x5, x6, [x0] + stp x7, x8, [x0, #16] + stp x9, x10, [x0, #32] + stp x11, x12, [x0, #48] + str x13, [x0, #64] + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S b/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S new file mode 100644 index 0000000000..2908ac6c78 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/bignum_inv_p521.S @@ -0,0 +1,2087 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Modular inverse modulo p_521 = 2^521 - 1 +// Input x[9]; output z[9] +// +// extern void bignum_inv_p521(uint64_t z[static 9],uint64_t x[static 9]); +// +// Assuming the 9-digit input x is coprime to p_521, i.e. is not divisible +// by it, returns z < p_521 such that x * z == 1 (mod p_521). Note that +// x does not need to be reduced modulo p_521, but the output always is. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_inv_p521) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_inv_p521) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (9*N)(%rsp) +#define u (18*N)(%rsp) +#define v (27*N)(%rsp) +#define tmp (36*N)(%rsp) +#define tmp2 (37*N)(%rsp) +#define i (38*N)(%rsp) +#define d (39*N)(%rsp) + +#define mat (40*N)(%rsp) + +// Backup for the input pointer + +#define res (44*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (45*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (9*N) +#define U (18*N) +#define V (27*N) +#define MAT (40*N) + +#define ff (%rsp) +#define gg (9*N)(%rsp) + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_inv_p521): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Copy the prime p_521 = 2^521 - 1 into the f variable + + xorl %eax, %eax + notq %rax + 
movq %rax, F(%rsp) + movq %rax, F+8(%rsp) + movq %rax, F+16(%rsp) + movq %rax, F+24(%rsp) + movq %rax, F+32(%rsp) + movq %rax, F+40(%rsp) + movq %rax, F+48(%rsp) + movq %rax, F+56(%rsp) + movl $0x1FF, %eax + movq %rax, F+64(%rsp) + +// Copy the input into the g variable, but reduce it strictly mod p_521 +// so that g <= f as assumed in the bound proof. This code fragment is +// very similar to bignum_mod_p521_9. + + movq 64(%rsi), %r8 + movl $0x1FF, %ebx + andq %r8, %rbx + shrq $9, %r8 + + stc + adcq (%rsi), %r8 + movq 8(%rsi), %r9 + adcq $0, %r9 + movq 16(%rsi), %r10 + adcq $0, %r10 + movq 24(%rsi), %r11 + adcq $0, %r11 + movq 32(%rsi), %r12 + adcq $0, %r12 + movq 40(%rsi), %r13 + adcq $0, %r13 + movq 48(%rsi), %r14 + adcq $0, %r14 + movq 56(%rsi), %r15 + adcq $0, %r15 + adcq $0, %rbx + + cmpq $512, %rbx + + sbbq $0, %r8 + movq %r8, G(%rsp) + sbbq $0, %r9 + movq %r9, G+8(%rsp) + sbbq $0, %r10 + movq %r10, G+16(%rsp) + sbbq $0, %r11 + movq %r11, G+24(%rsp) + sbbq $0, %r12 + movq %r12, G+32(%rsp) + sbbq $0, %r13 + movq %r13, G+40(%rsp) + sbbq $0, %r14 + movq %r14, G+48(%rsp) + sbbq $0, %r15 + movq %r15, G+56(%rsp) + sbbq $0, %rbx + andq $0x1FF, %rbx + movq %rbx, G+64(%rsp) + +// Also maintain weakly reduced < 2*p_521 vector [u,v] such that +// [f,g] == x * 2^{1239-59*i} * [u,v] (mod p_521) +// starting with [p_521,x] == x * 2^{1239-59*0} * [0,2^-1239] (mod p_521) +// Note that because (2^{a+521} == 2^a) (mod p_521) we simply have +// (2^-1239 == 2^324) (mod p_521) so the constant initializer is simple. +// +// Based on the standard divstep bound, for inputs <= 2^b we need at least +// n >= (9437 * b + 1) / 4096. Since b is 521, that means 1201 iterations. +// Since we package divstep in multiples of 59 bits, we do 21 blocks of 59 +// making *1239* total. (With a bit more effort we could avoid the full 59 +// divsteps and use a shorter tail computation, but we keep it simple.) +// Hence, after the 21st iteration we have [f,g] == x * [u,v] and since +// |f| = 1 we get the modular inverse from u by flipping its sign with f. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + movq %rax, U+32(%rsp) + movq %rax, U+40(%rsp) + movq %rax, U+48(%rsp) + movq %rax, U+56(%rsp) + movq %rax, U+64(%rsp) + + movl $16, %ebx + movq %rax, V(%rsp) + movq %rax, V+8(%rsp) + movq %rax, V+16(%rsp) + movq %rax, V+24(%rsp) + movq %rax, V+32(%rsp) + movq %rbx, V+40(%rsp) + movq %rax, V+48(%rsp) + movq %rax, V+56(%rsp) + movq %rax, V+64(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special 21st iteration after a uniform +// first 20. + + movq $21, i + movq $1, d + jmp midloop + +loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. 
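The "complement instead of negation" offset described just above comes from a two's-complement identity: if a signed matrix entry m is split into a sign mask s (zero or all-ones) and a magnitude a, then m*x == (x ^ s)*a + (a & s) modulo a power of two, because x ^ s equals -x - 1 when s is all-ones. The assembly applies the per-word XOR to every digit of f, g, u, v and adds the single (a & s) offset once at the bottom digit. A minimal single-word sketch of the identity (mul_signed_via_complement is illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    // m*x computed from the sign mask s and magnitude a of m, using bitwise
    // complement plus a one-off additive offset (a & s) instead of negating x.
    static uint64_t mul_signed_via_complement(uint64_t x, int64_t m) {
        uint64_t s = (uint64_t)(m >> 63);   // 0 when m >= 0, all-ones when m < 0
        uint64_t a = ((uint64_t)m ^ s) - s; // magnitude |m|
        return (x ^ s) * a + (a & s);       // == m * x (mod 2^64)
    }

    int main(void) {
        assert(mul_signed_via_complement(7, -3) == (uint64_t)-21);
        assert(mul_signed_via_complement(5, 4) == 20);
        return 0;
    }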
+ + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. +// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digit 3 of [f,g] + + xorl %ebp, %ebp + movq F+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + + xorl %ecx, %ecx + movq F+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + +// Digit 4 of [f,g] + + xorl %edi, %edi + movq F+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, F+3*N(%rsp) + + xorl %esi, %esi + movq F+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, G+3*N(%rsp) + +// Digit 5 of [f,g] + + xorl %ebx, %ebx + movq F+5*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+5*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, F+4*N(%rsp) + + xorl %ebp, %ebp + movq F+5*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+5*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, G+4*N(%rsp) + +// Digit 6 of [f,g] + + xorl %ecx, %ecx + movq F+6*N(%rsp), %rax + xorq 
%r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+6*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F+5*N(%rsp) + + xorl %edi, %edi + movq F+6*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+6*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G+5*N(%rsp) + +// Digit 7 of [f,g] + + xorl %esi, %esi + movq F+7*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+7*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+6*N(%rsp) + + xorl %ebx, %ebx + movq F+7*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+7*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+6*N(%rsp) + +// Digits 8 and 9 of [f,g] + + movq F+8*N(%rsp), %rax + xorq %r9, %rax + movq %rax, %rbp + sarq $63, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+8*N(%rsp), %rax + xorq %r11, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+7*N(%rsp) + shrdq $59, %rbp, %rsi + + movq F+8*N(%rsp), %rax + movq %rsi, F+8*N(%rsp) + + xorq %r13, %rax + movq %rax, %rsi + sarq $63, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+8*N(%rsp), %rax + xorq %r15, %rax + movq %rax, %rdx + sarq $63, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+7*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+8*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digit 3 of [u,v] + + xorl %ebx, %ebx + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq 
%rdx, %rbx + + xorl %ebp, %ebp + movq U+3*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+3*N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+3*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+3*N(%rsp) + +// Digit 4 of [u,v] + + xorl %ecx, %ecx + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+4*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+4*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+4*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+4*N(%rsp) + +// Digit 5 of [u,v] + + xorl %ebx, %ebx + movq U+5*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+5*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+5*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+5*N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+5*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+5*N(%rsp) + +// Digit 6 of [u,v] + + xorl %ecx, %ecx + movq U+6*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+6*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+6*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+6*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+6*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+6*N(%rsp) + +// Digit 7 of [u,v] + + xorl %ebx, %ebx + movq U+7*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+7*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+7*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+7*N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+7*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+7*N(%rsp) + +// Digits 8 and 9 of u (top is unsigned) + + movq U+8*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rcx + andq %r8, %rcx + negq %rcx + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+8*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rcx + mulq %r10 + addq %rax, %rbx + adcq %rcx, %rdx + +// Modular reduction of u + + movq %rdx, %rax + shldq $55, %rbx, %rdx + sarq $63, %rax + addq %rax, %rdx + movq %rdx, %rax + shlq $9, %rdx + subq %rdx, %rbx + movq %rax, %rdx + sarq $63, %rax + movq U(%rsp), %rcx + addq %rdx, %rcx + movq %rcx, U(%rsp) + movq U+N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+N(%rsp) + movq U+2*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+2*N(%rsp) + movq U+3*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+3*N(%rsp) + movq U+4*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+4*N(%rsp) + movq U+5*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+5*N(%rsp) + movq U+6*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+6*N(%rsp) + movq U+7*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+7*N(%rsp) + adcq %rax, %rbx + +// Preload for last use of old u digit 8 + + movq U+8*N(%rsp), %rax + movq %rbx, U+8*N(%rsp) + +// Digits 8 and 9 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rbx + andq %r12, %rbx + negq %rbx + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rbx + movq V+8*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + 
andq %r14, %rdx + subq %rdx, %rbx + mulq %r14 + addq %rax, %rbp + adcq %rbx, %rdx + +// Modular reduction of v + + movq %rdx, %rax + shldq $55, %rbp, %rdx + sarq $63, %rax + addq %rax, %rdx + movq %rdx, %rax + shlq $9, %rdx + subq %rdx, %rbp + movq %rax, %rdx + sarq $63, %rax + movq V(%rsp), %rcx + addq %rdx, %rcx + movq %rcx, V(%rsp) + movq V+N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+N(%rsp) + movq V+2*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+2*N(%rsp) + movq V+3*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+3*N(%rsp) + movq V+4*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+4*N(%rsp) + movq V+5*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+5*N(%rsp) + movq V+6*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+6*N(%rsp) + movq V+7*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, V+7*N(%rsp) + adcq %rax, %rbp + movq %rbp, V+8*N(%rsp) + +midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz loop + +// The 21st and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * [u,v] (mod p_521) +// we want to flip the sign of u according to that of f. 
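To make the sign test above concrete: for in-scope inputs the final f is exactly +1 or -1, so the unshifted bottom word whose bit 63 is inspected is +2^59 or -2^59, and an arithmetic shift by 63 turns that directly into the 0 / all-ones mask used to conditionally flip the sign of u. A small sketch of just that observation (plain C, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        // Bottom word of m00*f + m10*g before the (omitted) shift right by 59,
        // in the two in-scope cases f = +1 and f = -1.
        uint64_t when_plus_one  = (uint64_t)1 << 59;
        uint64_t when_minus_one = (uint64_t)0 - ((uint64_t)1 << 59);
        // Arithmetic shift of bit 63 yields the negation mask for u.
        assert(((int64_t)when_plus_one  >> 63) == 0);
        assert(((int64_t)when_minus_one >> 63) == -1);
        return 0;
    }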
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rbx + andq %r11, %rbx + addq %rax, %rbx + +// Digit 0 of u + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + movq %rbx, U(%rsp) + adcq %rdx, %rcx + +// Digit 1 of u + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + movq %rcx, U+N(%rsp) + adcq %rdx, %rbx + +// Digit 2 of u + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + movq %rbx, U+2*N(%rsp) + adcq %rdx, %rcx + +// Digit 3 of u + + xorl %ebx, %ebx + movq U+3*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + movq %rcx, U+3*N(%rsp) + adcq %rdx, %rbx + +// Digit 4 of u + + xorl %ecx, %ecx + movq U+4*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+4*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + movq %rbx, U+4*N(%rsp) + adcq %rdx, %rcx + +// Digit 5 of u + + xorl %ebx, %ebx + movq U+5*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+5*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + movq %rcx, U+5*N(%rsp) + adcq %rdx, %rbx + +// Digit 6 of u + + xorl %ecx, %ecx + movq U+6*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+6*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + movq %rbx, U+6*N(%rsp) + adcq %rdx, %rcx + +// Digit 7 of u + + xorl %ebx, %ebx + movq U+7*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+7*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + movq %rcx, U+7*N(%rsp) + adcq %rdx, %rbx + +// Digits 8 and 9 of u (top is unsigned) + + movq U+8*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rcx + andq %r8, %rcx + negq %rcx + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V+8*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rcx + mulq %r10 + addq %rax, %rbx + adcq %rcx, %rdx + +// Modular reduction of u + + movq %rdx, %rax + shldq $55, %rbx, %rdx + sarq $63, %rax + addq %rax, %rdx + movq %rdx, %rax + shlq $9, %rdx + subq %rdx, %rbx + movq %rax, %rdx + sarq $63, %rax + movq U(%rsp), %rcx + addq %rdx, %rcx + movq %rcx, U(%rsp) + movq U+N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+N(%rsp) + movq U+2*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+2*N(%rsp) + movq U+3*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+3*N(%rsp) + movq U+4*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+4*N(%rsp) + movq U+5*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+5*N(%rsp) + movq U+6*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+6*N(%rsp) + movq U+7*N(%rsp), %rcx + adcq %rax, %rcx + movq %rcx, U+7*N(%rsp) + adcq %rax, %rbx + movq %rbx, U+8*N(%rsp) + 
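The reduction of u just performed exploits the Mersenne shape of the modulus: 2^521 == 1 (mod p_521), so everything at or above bit 521, including the signed top word, can be folded back into the low 521 bits with an addition, leaving a value that only needs the conditional subtraction below to become strictly reduced. A minimal sketch of the same folding idea on a single word, using the small Mersenne prime 2^61 - 1 as a stand-in for p_521 (fold_mersenne61 is illustrative, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    // Reduce x modulo p = 2^61 - 1 by folding: x == (x & p) + (x >> 61) (mod p).
    static uint64_t fold_mersenne61(uint64_t x) {
        const uint64_t p = ((uint64_t)1 << 61) - 1;
        x = (x & p) + (x >> 61); // fold the high bits down once
        x = (x & p) + (x >> 61); // after a second fold the result is <= p
        return x;                // a final conditional subtraction of p makes it < p
    }

    int main(void) {
        const uint64_t p = ((uint64_t)1 << 61) - 1;
        assert(fold_mersenne61(p + 5) == 5);
        assert(fold_mersenne61(42) == 42);
        assert(fold_mersenne61(p) == p); // still needs the conditional subtraction
        return 0;
    }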
+// Further strict reduction ready for the output, which just means +// a conditional subtraction of p_521 + + xorl %eax, %eax + notq %rax + movq U(%rsp), %r8 + subq %rax, %r8 + movq U+N(%rsp), %r9 + sbbq %rax, %r9 + movq U+2*N(%rsp), %r10 + sbbq %rax, %r10 + movq U+3*N(%rsp), %r11 + sbbq %rax, %r11 + movq U+4*N(%rsp), %r12 + sbbq %rax, %r12 + movq U+5*N(%rsp), %r13 + sbbq %rax, %r13 + movq U+6*N(%rsp), %r14 + sbbq %rax, %r14 + movq U+7*N(%rsp), %r15 + sbbq %rax, %r15 + movl $0x1FF, %eax + movq U+8*N(%rsp), %rbp + sbbq %rax, %rbp + + cmovcq U(%rsp), %r8 + cmovcq U+N(%rsp), %r9 + cmovcq U+2*N(%rsp), %r10 + cmovcq U+3*N(%rsp), %r11 + cmovcq U+4*N(%rsp), %r12 + cmovcq U+5*N(%rsp), %r13 + cmovcq U+6*N(%rsp), %r14 + cmovcq U+7*N(%rsp), %r15 + cmovcq U+8*N(%rsp), %rbp + +// Store it back to the final output + + movq res, %rdi + movq %r8, (%rdi) + movq %r9, N(%rdi) + movq %r10, 2*N(%rdi) + movq %r11, 3*N(%rdi) + movq %r12, 4*N(%rdi) + movq %r13, 5*N(%rdi) + movq %r14, 6*N(%rdi) + movq %r15, 7*N(%rdi) + movq %rbp, 8*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S b/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S new file mode 100644 index 0000000000..905c32a76d --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul.S @@ -0,0 +1,2460 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +// +// extern void p521_jscalarmul +// (uint64_t res[static 27], +// uint64_t scalar[static 9], +// uint64_t point[static 27]); +// +// This function is a variant of its affine point version p521_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// a triple (x,y,z) representing the affine point (x/z^2,y/z^3) when +// z is nonzero or the point at infinity (group identity) if z = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-521, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_521) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 +#define JACSIZE (3*NUMSIZE) + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE +// Uppercase syntactic variants make x86_att version simpler to generate. 
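As a quick consistency check of the layout described above, the offsets defined immediately below fit together as follows: the 16-entry table of Jacobian points starts at field-element slot 7 and occupies 48 slots, ending exactly where the saved res pointer lives, and the whole frame is 56 field elements. A sketch of that arithmetic (the enum names mirror the defines but are otherwise illustrative):

    #include <assert.h>

    enum {
        NUMSIZE = 72,              // one P-521 field element: 9 x 64-bit words
        JACSIZE = 3 * NUMSIZE,     // one Jacobian point (x, y, z)
        TAB     = 7 * NUMSIZE,     // table of 16 points: slots 7..54
        RES     = 55 * NUMSIZE,    // saved output pointer
        NSPACE  = 56 * NUMSIZE     // total stack frame
    };

    int main(void) {
        assert(TAB + 16 * JACSIZE == RES); // table ends exactly at the saved pointer
        assert(RES + 8 <= NSPACE);         // room for one quadword after the table
        return 0;
    }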
+ +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (55*NUMSIZE)(%rsp) + +#define NSPACE (56*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I,C) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)+C*NUMSIZE(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+8+C*NUMSIZE(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+16+C*NUMSIZE(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+24+C*NUMSIZE(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+32+C*NUMSIZE(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+40+C*NUMSIZE(%rsp), %r9 ; \ + cmovzq TAB+JACSIZE*(I-1)+48+C*NUMSIZE(%rsp), %r10 ; \ + cmovzq TAB+JACSIZE*(I-1)+56+C*NUMSIZE(%rsp), %r11 ; \ + cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 + +S2N_BN_SYMBOL(p521_jscalarmul): + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p521_jscalarmul_standard + popq %rsi + popq %rdi + ret + +p521_jscalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" input argument; others get processed early. + + movq %rdi, res + +// Reduce the input scalar mod n_521 and store it to "scalarb". + + movq %rdx, %rbx + leaq SCALARB(%rsp), %rdi + callq p521_jscalarmul_bignum_mod_n521_9 + +// Set the tab[0] table entry to the input point = 1 * P, but also +// reduce all coordinates modulo p. In principle we assume reduction +// as a precondition, but this reduces the scope for surprise, e.g. +// making sure that any input with z = 0 is treated as zero, even +// if the other coordinates are not in fact reduced. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p521_jscalarmul_bignum_mod_p521_9 + + leaq TAB+NUMSIZE(%rsp), %rdi + leaq NUMSIZE(%rbx), %rsi + callq p521_jscalarmul_bignum_mod_p521_9 + + leaq TAB+2*NUMSIZE(%rsp), %rdi + leaq 2*NUMSIZE(%rbx), %rsi + callq p521_jscalarmul_bignum_mod_p521_9 + +// If bit 520 of the scalar is set, then negate the scalar mod n_521, +// i.e. do scalar |-> n_521 - scalar, and also the point to compensate +// by negating its y coordinate. This further step is not needed by +// the indexing scheme (the top window is only a couple of bits either +// way), but is convenient to exclude a problem with the specific value +// scalar = n_521 - 18, where the last Jacobian addition is of the form +// (n_521 - 9) * P + -(9 * P) and hence is a degenerate doubling case. 
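In higher-level terms, the scalar/point adjustment described in the comment above (and implemented by the code that follows) is roughly this, where n_521 is rebuilt from the very limbs loaded below (a Python sketch, not part of the patch):

    P_521 = 2**521 - 1
    N_521 = sum(limb << (64 * i) for i, limb in enumerate([
        0xbb6fb71e91386409, 0x3bb5c9b8899c47ae, 0x7fcc0148f709a5d0,
        0x51868783bf2f966b, 0xfffffffffffffffa, 0xffffffffffffffff,
        0xffffffffffffffff, 0xffffffffffffffff, 0x1ff]))

    def negate_scalar_and_y(scalar, y):
        # If bit 520 of the reduced scalar is set, replace it by n_521 - scalar
        # and negate y, since (n_521 - k) * (-P) = k * P.  A zero y (notably the
        # point at infinity) is left alone so the stored value stays below p_521.
        if (scalar >> 520) & 1:
            scalar = N_521 - scalar
            if y != 0:
                y = P_521 - y      # equal to the 521-bit complement of y
        return scalar, y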
+ + xorl %eax, %eax + notq %rax + movq $0xbb6fb71e91386409, %r8 + subq SCALARB(%rsp), %r8 + movq $0x3bb5c9b8899c47ae, %r9 + sbbq SCALARB+8(%rsp), %r9 + movq $0x7fcc0148f709a5d0, %r10 + sbbq SCALARB+16(%rsp), %r10 + movq $0x51868783bf2f966b, %r11 + sbbq SCALARB+24(%rsp), %r11 + leaq -5(%rax), %r12 + sbbq SCALARB+32(%rsp), %r12 + movq %rax, %r13 + sbbq SCALARB+40(%rsp), %r13 + movq %rax, %r14 + sbbq SCALARB+48(%rsp), %r14 + movq %rax, %r15 + sbbq SCALARB+56(%rsp), %r15 + movq $0x1ff, %rax + movq SCALARB+64(%rsp), %rcx + sbbq %rcx, %rax + + btq $8, %rcx + sbbq %rcx, %rcx + + cmovncq SCALARB(%rsp), %r8 + cmovncq SCALARB+8(%rsp), %r9 + cmovncq SCALARB+16(%rsp), %r10 + cmovncq SCALARB+24(%rsp), %r11 + cmovncq SCALARB+32(%rsp), %r12 + cmovncq SCALARB+40(%rsp), %r13 + cmovncq SCALARB+48(%rsp), %r14 + cmovncq SCALARB+56(%rsp), %r15 + cmovncq SCALARB+64(%rsp), %rax + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + + movq TAB+NUMSIZE(%rsp), %r8 + movq TAB+NUMSIZE+8(%rsp), %r9 + movq TAB+NUMSIZE+16(%rsp), %r10 + movq TAB+NUMSIZE+24(%rsp), %r11 + movq TAB+NUMSIZE+32(%rsp), %r12 + movq TAB+NUMSIZE+40(%rsp), %r13 + movq TAB+NUMSIZE+48(%rsp), %r14 + movq TAB+NUMSIZE+56(%rsp), %r15 + movq TAB+NUMSIZE+64(%rsp), %rax + + movq %r8, %rbx + movq %r12, %rbp + orq %r9, %rbx + orq %r13, %rbp + orq %r10, %rbx + orq %r14, %rbp + orq %r11, %rbx + orq %r15, %rbp + orq %rbp, %rbx + orq %rax, %rbx + cmovzq %rbx, %rcx + + xorq %rcx, %r8 + xorq %rcx, %r9 + xorq %rcx, %r10 + xorq %rcx, %r11 + xorq %rcx, %r12 + xorq %rcx, %r13 + xorq %rcx, %r14 + xorq %rcx, %r15 + andq $0x1FF, %rcx + xorq %rcx, %rax + + movq %r8, TAB+NUMSIZE(%rsp) + movq %r9, TAB+NUMSIZE+8(%rsp) + movq %r10, TAB+NUMSIZE+16(%rsp) + movq %r11, TAB+NUMSIZE+24(%rsp) + movq %r12, TAB+NUMSIZE+32(%rsp) + movq %r13, TAB+NUMSIZE+40(%rsp) + movq %r14, TAB+NUMSIZE+48(%rsp) + movq %r15, TAB+NUMSIZE+56(%rsp) + movq %rax, TAB+NUMSIZE+64(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + leaq TAB+JACSIZE*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*2(%rsp), %rdi + leaq TAB+JACSIZE*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*3(%rsp), %rdi + leaq TAB+JACSIZE*1(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*4(%rsp), %rdi + leaq TAB+JACSIZE*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*5(%rsp), %rdi + leaq TAB+JACSIZE*2(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*6(%rsp), %rdi + leaq TAB+JACSIZE*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*7(%rsp), %rdi + leaq TAB+JACSIZE*3(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*8(%rsp), %rdi + leaq TAB+JACSIZE*7(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*9(%rsp), %rdi + leaq TAB+JACSIZE*4(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*10(%rsp), %rdi + leaq TAB+JACSIZE*9(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*11(%rsp), %rdi + leaq TAB+JACSIZE*5(%rsp), %rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*12(%rsp), %rdi + leaq TAB+JACSIZE*11(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*13(%rsp), %rdi + leaq TAB+JACSIZE*6(%rsp), 
%rsi + callq p521_jscalarmul_jdouble + + leaq TAB+JACSIZE*14(%rsp), %rdi + leaq TAB+JACSIZE*13(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_jadd + + leaq TAB+JACSIZE*15(%rsp), %rdi + leaq TAB+JACSIZE*7(%rsp), %rsi + callq p521_jscalarmul_jdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically to use fewer large constant loads. +// +// 0x0842108421084210 %rax +// 0x1084210842108421 %rbx +// 0x2108421084210842 %rbx<<1 +// 0x4210842108421084 %rbx<<2 +// 0x8421084210842108 %rbx<<3 +// 0x0842108421084210 %rax +// 0x1084210842108421 %rbx +// 0x2108421084210842 %rbx<<1 +// 0x0000000000000084 + + movq $0x1084210842108421, %rax + movq %rax, %rbx + shrq $1, %rax + movq SCALARB(%rsp), %r8 + addq %rax, %r8 + movq SCALARB+8(%rsp), %r9 + adcq %rbx, %r9 + leaq (%rbx,%rbx), %rcx + movq SCALARB+16(%rsp), %r10 + adcq %rcx, %r10 + leaq (%rcx,%rcx), %rcx + movq SCALARB+24(%rsp), %r11 + adcq %rcx, %r11 + leaq (%rcx,%rcx), %rcx + movq SCALARB+32(%rsp), %r12 + adcq %rcx, %r12 + movq SCALARB+40(%rsp), %r13 + adcq %rax, %r13 + movq SCALARB+48(%rsp), %r14 + adcq %rbx, %r14 + movq SCALARB+56(%rsp), %r15 + leaq (%rbx,%rbx), %rcx + adcq %rcx, %r15 + movq SCALARB+64(%rsp), %rax + adcq $0x84, %rax + +// Because of the initial reduction the top bitfield (>= bits 520) is <= 1, +// i.e. just a single bit. Record that in %rdi, then shift the whole +// scalar left 56 bits to align the top of the next bitfield with the MSB +// (bits 571..575). + + movq %rax, %rdi + shrq $8, %rdi + shldq $56, %r15, %rax + shldq $56, %r14, %r15 + shldq $56, %r13, %r14 + shldq $56, %r12, %r13 + shldq $56, %r11, %r12 + shldq $56, %r10, %r11 + shldq $56, %r9, %r10 + shldq $56, %r8, %r9 + shlq $56, %r8 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + +// According to the top bit, initialize the accumulator to P or 0. This top +// digit, uniquely, is not recoded so there is no sign adjustment to make. +// We only really need to adjust the z coordinate to zero, but do all three. 
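The recoding just performed on the scalar, and the single leftover top bit saved in %rdi, can be summarized by the following Python sketch (illustrative only):

    def recode_signed_windows(k):
        # Add the constant sum over i of 16 * 32^i, then read 5-bit windows and
        # subtract 16 from each: the digits land in [-16, 15], so only the
        # multiples 1*P .. 16*P need to be tabulated.  The remaining top field
        # (bits 520 and up) is the plain, un-recoded 0/1 bit.
        t = k + sum(16 << (5 * i) for i in range(104))   # 104 windows of 5 bits
        top = t >> 520
        digits = [((t >> (5 * i)) & 31) - 16 for i in reversed(range(104))]
        # Reconstruction: k = top * 2^520 + sum of digit_j * 32^(103 - j).
        assert k == (top << 520) + sum(d << (5 * (103 - j))
                                       for j, d in enumerate(digits))
        return top, digits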
+ + xorl %ecx, %ecx + testq %rdi, %rdi + + movq TAB(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC(%rsp) + movq TAB+8(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+8(%rsp) + movq TAB+16(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+16(%rsp) + movq TAB+24(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+24(%rsp) + movq TAB+32(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+32(%rsp) + movq TAB+40(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+40(%rsp) + movq TAB+48(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+48(%rsp) + movq TAB+56(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+56(%rsp) + movq TAB+64(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+64(%rsp) + movq TAB+72(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+72(%rsp) + movq TAB+80(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+80(%rsp) + movq TAB+88(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+88(%rsp) + movq TAB+96(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+96(%rsp) + movq TAB+104(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+104(%rsp) + movq TAB+112(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+112(%rsp) + movq TAB+120(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+120(%rsp) + movq TAB+128(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+128(%rsp) + movq TAB+136(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+136(%rsp) + movq TAB+144(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+144(%rsp) + movq TAB+152(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+152(%rsp) + movq TAB+160(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+160(%rsp) + movq TAB+168(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+168(%rsp) + movq TAB+176(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+176(%rsp) + movq TAB+184(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+184(%rsp) + movq TAB+192(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+192(%rsp) + movq TAB+200(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+200(%rsp) + movq TAB+208(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+208(%rsp) + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. 
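As a cross-check of the window bookkeeping, here is a scalar-only Python model of the loop below, with the Jacobian doublings and additions replaced by integer arithmetic on the multiple being accumulated (illustrative only):

    def scalar_model(k):
        t = k + sum(16 << (5 * i) for i in range(104))   # recoding constant, as above
        acc = t >> 520                      # accumulator starts as 0*P or 1*P
        for i in reversed(range(104)):      # one pass of the main loop per window
            acc = 32 * acc                  # five doublings
            d = ((t >> (5 * i)) & 31) - 16  # signed digit in [-16, 15]
            acc += d                        # add d*P: select tab[|d|-1], y negated if d < 0
        return acc

    assert scalar_model(0xdeadbeef) == 0xdeadbeef   # the model reproduces k exactly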
+ + movl $520, %ebp + +p521_jscalarmul_mainloop: + subq $5, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jdouble + +// Choose the bitfield and adjust it to sign and magnitude + + movq SCALARB(%rsp), %r8 + movq SCALARB+8(%rsp), %r9 + movq SCALARB+16(%rsp), %r10 + movq SCALARB+24(%rsp), %r11 + movq SCALARB+32(%rsp), %r12 + movq SCALARB+40(%rsp), %r13 + movq SCALARB+48(%rsp), %r14 + movq SCALARB+56(%rsp), %r15 + movq SCALARB+64(%rsp), %rax + + + movq %rax, %rdi + shrq $59, %rdi + + shldq $5, %r15, %rax + shldq $5, %r14, %r15 + shldq $5, %r13, %r14 + shldq $5, %r12, %r13 + shldq $5, %r11, %r12 + shldq $5, %r10, %r11 + shldq $5, %r9, %r10 + shldq $5, %r8, %r9 + shlq $5, %r8 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + + subq $16, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time +// Again, this is done in separate sweeps per coordinate, doing y last. + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,0) + selectblock(2,0) + selectblock(3,0) + selectblock(4,0) + selectblock(5,0) + selectblock(6,0) + selectblock(7,0) + selectblock(8,0) + selectblock(9,0) + selectblock(10,0) + selectblock(11,0) + selectblock(12,0) + selectblock(13,0) + selectblock(14,0) + selectblock(15,0) + selectblock(16,0) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + movq %r12, TABENT+64(%rsp) + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,2) + selectblock(2,2) + selectblock(3,2) + selectblock(4,2) + selectblock(5,2) + selectblock(6,2) + selectblock(7,2) + selectblock(8,2) + selectblock(9,2) + selectblock(10,2) + selectblock(11,2) + selectblock(12,2) + selectblock(13,2) + selectblock(14,2) + selectblock(15,2) + selectblock(16,2) + movq %rax, TABENT+2*NUMSIZE(%rsp) + movq %rbx, TABENT+2*NUMSIZE+8(%rsp) + movq %rcx, TABENT+2*NUMSIZE+16(%rsp) + movq %rdx, TABENT+2*NUMSIZE+24(%rsp) + movq %r8, TABENT+2*NUMSIZE+32(%rsp) + movq %r9, TABENT+2*NUMSIZE+40(%rsp) + movq %r10, TABENT+2*NUMSIZE+48(%rsp) + movq %r11, TABENT+2*NUMSIZE+56(%rsp) + movq %r12, TABENT+2*NUMSIZE+64(%rsp) + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,1) + selectblock(2,1) + selectblock(3,1) + selectblock(4,1) + selectblock(5,1) + selectblock(6,1) + selectblock(7,1) + selectblock(8,1) + selectblock(9,1) + selectblock(10,1) + selectblock(11,1) + 
selectblock(12,1) + selectblock(13,1) + selectblock(14,1) + selectblock(15,1) + selectblock(16,1) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_521 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, %r13 + orq %rbx, %r13 + movq %rcx, %r14 + orq %rdx, %r14 + movq %r8, %r15 + orq %r9, %r15 + movq %r10, %rdi + orq %r11, %rdi + orq %r14, %r13 + orq %rdi, %r15 + orq %r12, %r15 + orq %r15, %r13 + cmovzq %r13, %rsi + + xorq %rsi, %rax + xorq %rsi, %rbx + xorq %rsi, %rcx + xorq %rsi, %rdx + xorq %rsi, %r8 + xorq %rsi, %r9 + xorq %rsi, %r10 + xorq %rsi, %r11 + andq $0x1FF, %rsi + xorq %rsi, %r12 + + movq %rax, TABENT+NUMSIZE(%rsp) + movq %rbx, TABENT+NUMSIZE+8(%rsp) + movq %rcx, TABENT+NUMSIZE+16(%rsp) + movq %rdx, TABENT+NUMSIZE+24(%rsp) + movq %r8, TABENT+NUMSIZE+32(%rsp) + movq %r9, TABENT+NUMSIZE+40(%rsp) + movq %r10, TABENT+NUMSIZE+48(%rsp) + movq %r11, TABENT+NUMSIZE+56(%rsp) + movq %r12, TABENT+NUMSIZE+64(%rsp) + +// Add to the accumulator + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_jadd + + testq %rbp, %rbp + jne p521_jscalarmul_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + movq ACC+96(%rsp), %rax + movq %rax, 96(%rdi) + movq ACC+104(%rsp), %rax + movq %rax, 104(%rdi) + movq ACC+112(%rsp), %rax + movq %rax, 112(%rdi) + movq ACC+120(%rsp), %rax + movq %rax, 120(%rdi) + movq ACC+128(%rsp), %rax + movq %rax, 128(%rdi) + movq ACC+136(%rsp), %rax + movq %rax, 136(%rdi) + movq ACC+144(%rsp), %rax + movq %rax, 144(%rdi) + movq ACC+152(%rsp), %rax + movq %rax, 152(%rdi) + movq ACC+160(%rsp), %rax + movq %rax, 160(%rdi) + movq ACC+168(%rsp), %rax + movq %rax, 168(%rdi) + movq ACC+176(%rsp), %rax + movq %rax, 176(%rdi) + movq ACC+184(%rsp), %rax + movq %rax, 184(%rdi) + movq ACC+192(%rsp), %rax + movq %rax, 192(%rdi) + movq ACC+200(%rsp), %rax + movq %rax, 200(%rdi) + movq ACC+208(%rsp), %rax + movq %rax, 208(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p521_jscalarmul_bignum_mod_p521_9: + pushq %rbx + movq 0x40(%rsi), %rax + movl $0x1ff, %edx + andq %rax, %rdx + shrq $0x9, %rax + stc + adcq (%rsi), %rax + movq 0x8(%rsi), %rcx + adcq $0x0, %rcx + movq 0x10(%rsi), %r8 + adcq $0x0, %r8 + movq 0x18(%rsi), %r9 + adcq $0x0, %r9 + movq 0x20(%rsi), %r10 + adcq $0x0, %r10 + movq 0x28(%rsi), %r11 + adcq $0x0, %r11 + movq 0x30(%rsi), %rbx + adcq $0x0, %rbx + movq 0x38(%rsi), %rsi + adcq $0x0, %rsi + adcq $0x0, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, (%rdi) + sbbq $0x0, %rcx + movq %rcx, 0x8(%rdi) + sbbq $0x0, %r8 + movq %r8, 0x10(%rdi) + sbbq $0x0, %r9 + movq %r9, 0x18(%rdi) + sbbq $0x0, %r10 + movq %r10, 
0x20(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x28(%rdi) + sbbq $0x0, %rbx + movq %rbx, 0x30(%rdi) + sbbq $0x0, %rsi + movq %rsi, 0x38(%rdi) + sbbq $0x0, %rdx + andq $0x1ff, %rdx + movq %rdx, 0x40(%rdi) + popq %rbx + ret + +p521_jscalarmul_bignum_mod_n521_9: + movq 0x40(%rsi), %rdx + movq $0xfffffffffffffe00, %rax + orq %rdx, %rax + movq %rax, 0x40(%rdi) + shrq $0x9, %rdx + addq $0x1, %rdx + movq $0x449048e16ec79bf7, %r9 + mulxq %r9, %rax, %rcx + adcxq (%rsi), %rax + movq %rax, (%rdi) + movq $0xc44a36477663b851, %r10 + mulxq %r10, %rax, %r8 + adcxq 0x8(%rsi), %rax + adoxq %rcx, %rax + movq %rax, 0x8(%rdi) + movq $0x8033feb708f65a2f, %r11 + mulxq %r11, %rax, %rcx + adcxq 0x10(%rsi), %rax + adoxq %r8, %rax + movq %rax, 0x10(%rdi) + movq $0xae79787c40d06994, %rax + mulxq %rax, %rax, %r8 + adcxq 0x18(%rsi), %rax + adoxq %rcx, %rax + movq %rax, 0x18(%rdi) + movl $0x5, %eax + mulxq %rax, %rax, %rcx + adcxq 0x20(%rsi), %rax + adoxq %r8, %rax + movq %rax, 0x20(%rdi) + movq %rcx, %rax + adoxq %rcx, %rcx + adcq 0x28(%rsi), %rcx + movq %rcx, 0x28(%rdi) + movq 0x30(%rsi), %rcx + adcq %rax, %rcx + movq %rcx, 0x30(%rdi) + movq 0x38(%rsi), %rcx + adcq %rax, %rcx + movq %rcx, 0x38(%rdi) + movq 0x40(%rdi), %rcx + adcq %rax, %rcx + cmc + sbbq %rdx, %rdx + andq %rdx, %r9 + andq %rdx, %r10 + andq %rdx, %r11 + movq $0xae79787c40d06994, %r8 + andq %rdx, %r8 + andl $0x5, %edx + subq %r9, (%rdi) + sbbq %r10, 0x8(%rdi) + sbbq %r11, 0x10(%rdi) + sbbq %r8, 0x18(%rdi) + sbbq %rdx, 0x20(%rdi) + sbbq %rax, 0x28(%rdi) + sbbq %rax, 0x30(%rdi) + sbbq %rax, 0x38(%rdi) + sbbl %eax, %ecx + andl $0x1ff, %ecx + movq %rcx, 0x40(%rdi) + ret + +p521_jscalarmul_jadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x210, %rsp + movq %rdi, 0x1f8(%rsp) + movq %rsi, 0x200(%rsp) + movq %rdx, 0x208(%rsp) + movq 0x200(%rsp), %rsi + leaq 0x90(%rsi), %rsi + leaq (%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq 0x208(%rsp), %rdi + leaq 0x90(%rdi), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq 0x200(%rsp), %rsi + movq 0x208(%rsp), %rdi + leaq 0x48(%rsi), %rdx + leaq 0x90(%rdi), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x200(%rsp), %rsi + movq 0x208(%rsp), %rdi + leaq 0x48(%rdi), %rdx + leaq 0x90(%rsi), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x208(%rsp), %rdi + leaq (%rdi), %rdx + leaq (%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x200(%rsp), %rsi + leaq (%rsi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + leaq 0x48(%rsp), %rdx + leaq (%rsp), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + leaq 0x1b0(%rsp), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x90(%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x98(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0xa0(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0xa8(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0xb0(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0xb8(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0xc0(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0xc8(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0xd0(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 
+ movq %r13, 0x1a0(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x1a8(%rsp) + movq 0x48(%rsp), %rax + subq 0x1b0(%rsp), %rax + movq 0x50(%rsp), %rdx + sbbq 0x1b8(%rsp), %rdx + movq 0x58(%rsp), %r8 + sbbq 0x1c0(%rsp), %r8 + movq 0x60(%rsp), %r9 + sbbq 0x1c8(%rsp), %r9 + movq 0x68(%rsp), %r10 + sbbq 0x1d0(%rsp), %r10 + movq 0x70(%rsp), %r11 + sbbq 0x1d8(%rsp), %r11 + movq 0x78(%rsp), %r12 + sbbq 0x1e0(%rsp), %r12 + movq 0x80(%rsp), %r13 + sbbq 0x1e8(%rsp), %r13 + movq 0x88(%rsp), %r14 + sbbq 0x1f0(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x48(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x50(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x58(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x60(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x68(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x70(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x78(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x80(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x88(%rsp) + leaq 0x168(%rsp), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + leaq 0x48(%rsp), %rsi + leaq (%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + leaq 0x120(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + leaq 0x90(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq (%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x8(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0x10(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0x20(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0x28(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0x30(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0x38(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0x40(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, (%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x8(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x10(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x18(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x20(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x28(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x30(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x38(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x40(%rsp) + movq 0x90(%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x98(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0xa0(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0xa8(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0xb0(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0xb8(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0xc0(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0xc8(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0xd0(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0xd8(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0xe0(%rsp) + sbbq $0x0, %r8 + movq %r8, 0xe8(%rsp) + sbbq $0x0, %r9 + movq %r9, 0xf0(%rsp) + sbbq $0x0, %r10 + movq %r10, 0xf8(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x100(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x108(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x110(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x118(%rsp) + movq 0x200(%rsp), %rsi + leaq 0x90(%rsi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq (%rsp), %rax + subq 0x90(%rsp), %rax + movq 0x8(%rsp), %rdx + sbbq 0x98(%rsp), %rdx + movq 0x10(%rsp), %r8 + sbbq 0xa0(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0xa8(%rsp), %r9 + movq 0x20(%rsp), %r10 + sbbq 0xb0(%rsp), %r10 + movq 0x28(%rsp), %r11 + sbbq 0xb8(%rsp), %r11 + movq 0x30(%rsp), %r12 + sbbq 0xc0(%rsp), %r12 + movq 0x38(%rsp), %r13 + sbbq 0xc8(%rsp), %r13 + movq 0x40(%rsp), %r14 + sbbq 0xd0(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, (%rsp) + sbbq $0x0, 
%rdx + movq %rdx, 0x8(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x10(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x18(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x20(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x28(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x30(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x38(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x40(%rsp) + movq 0x120(%rsp), %rax + subq (%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + leaq 0x1b0(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x208(%rsp), %rdi + leaq 0x90(%rdi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + leaq 0x120(%rsp), %rdx + leaq 0x48(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x120(%rsp), %rax + subq 0xd8(%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0xe0(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0xe8(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0xf0(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0xf8(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x100(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x108(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x110(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x118(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + movq 0x200(%rsp), %rsi + movq 0x90(%rsi), %r8 + movq 0x98(%rsi), %r9 + movq 0xa0(%rsi), %r10 + movq 0xa8(%rsi), %r11 + movq 0xb0(%rsi), %r12 + movq 0xb8(%rsi), %r13 + movq 0xc0(%rsi), %r14 + movq 0xc8(%rsi), %r15 + movq 0xd0(%rsi), %rbp + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + movq 0x208(%rsp), %rdi + movq 0x90(%rdi), %r8 + movq 0x98(%rdi), %r9 + movq 0xa0(%rdi), %r10 + movq 0xa8(%rdi), %r11 + movq 0xb0(%rdi), %r12 + movq 0xb8(%rdi), %r13 + movq 0xc0(%rdi), %r14 + movq 0xc8(%rdi), %r15 + movq 0xd0(%rdi), %rbp + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + cmpq %rax, %rdx + movq 0x120(%rsp), %r8 + cmovbq 0x48(%rsi), %r8 + cmova 0x48(%rdi), %r8 + movq 0x128(%rsp), %r9 + cmovbq 0x50(%rsi), %r9 + cmova 0x50(%rdi), %r9 + movq 0x130(%rsp), %r10 + cmovbq 0x58(%rsi), %r10 + cmova 0x58(%rdi), %r10 + movq 0x138(%rsp), %r11 + cmovbq 0x60(%rsi), %r11 + cmova 0x60(%rdi), %r11 + movq 0x140(%rsp), %r12 + cmovbq 0x68(%rsi), %r12 + cmova 0x68(%rdi), %r12 + 
movq 0x148(%rsp), %r13 + cmovbq 0x70(%rsi), %r13 + cmova 0x70(%rdi), %r13 + movq 0x150(%rsp), %r14 + cmovbq 0x78(%rsi), %r14 + cmova 0x78(%rdi), %r14 + movq 0x158(%rsp), %r15 + cmovbq 0x80(%rsi), %r15 + cmova 0x80(%rdi), %r15 + movq 0x160(%rsp), %rbp + cmovbq 0x88(%rsi), %rbp + cmova 0x88(%rdi), %rbp + movq %r8, 0x120(%rsp) + movq %r9, 0x128(%rsp) + movq %r10, 0x130(%rsp) + movq %r11, 0x138(%rsp) + movq %r12, 0x140(%rsp) + movq %r13, 0x148(%rsp) + movq %r14, 0x150(%rsp) + movq %r15, 0x158(%rsp) + movq %rbp, 0x160(%rsp) + movq 0x168(%rsp), %r8 + cmovbq 0x90(%rsi), %r8 + cmova 0x90(%rdi), %r8 + movq 0x170(%rsp), %r9 + cmovbq 0x98(%rsi), %r9 + cmova 0x98(%rdi), %r9 + movq 0x178(%rsp), %r10 + cmovbq 0xa0(%rsi), %r10 + cmova 0xa0(%rdi), %r10 + movq 0x180(%rsp), %r11 + cmovbq 0xa8(%rsi), %r11 + cmova 0xa8(%rdi), %r11 + movq 0x188(%rsp), %r12 + cmovbq 0xb0(%rsi), %r12 + cmova 0xb0(%rdi), %r12 + movq 0x190(%rsp), %r13 + cmovbq 0xb8(%rsi), %r13 + cmova 0xb8(%rdi), %r13 + movq 0x198(%rsp), %r14 + cmovbq 0xc0(%rsi), %r14 + cmova 0xc0(%rdi), %r14 + movq 0x1a0(%rsp), %r15 + cmovbq 0xc8(%rsi), %r15 + cmova 0xc8(%rdi), %r15 + movq 0x1a8(%rsp), %rbp + cmovbq 0xd0(%rsi), %rbp + cmova 0xd0(%rdi), %rbp + movq %r8, 0x168(%rsp) + movq %r9, 0x170(%rsp) + movq %r10, 0x178(%rsp) + movq %r11, 0x180(%rsp) + movq %r12, 0x188(%rsp) + movq %r13, 0x190(%rsp) + movq %r14, 0x198(%rsp) + movq %r15, 0x1a0(%rsp) + movq %rbp, 0x1a8(%rsp) + movq (%rsp), %r8 + cmovbq (%rsi), %r8 + cmova (%rdi), %r8 + movq 0x8(%rsp), %r9 + cmovbq 0x8(%rsi), %r9 + cmova 0x8(%rdi), %r9 + movq 0x10(%rsp), %r10 + cmovbq 0x10(%rsi), %r10 + cmova 0x10(%rdi), %r10 + movq 0x18(%rsp), %r11 + cmovbq 0x18(%rsi), %r11 + cmova 0x18(%rdi), %r11 + movq 0x20(%rsp), %r12 + cmovbq 0x20(%rsi), %r12 + cmova 0x20(%rdi), %r12 + movq 0x28(%rsp), %r13 + cmovbq 0x28(%rsi), %r13 + cmova 0x28(%rdi), %r13 + movq 0x30(%rsp), %r14 + cmovbq 0x30(%rsi), %r14 + cmova 0x30(%rdi), %r14 + movq 0x38(%rsp), %r15 + cmovbq 0x38(%rsi), %r15 + cmova 0x38(%rdi), %r15 + movq 0x40(%rsp), %rbp + cmovbq 0x40(%rsi), %rbp + cmova 0x40(%rdi), %rbp + movq 0x1f8(%rsp), %rdi + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq %rbp, 0x40(%rdi) + movq 0x120(%rsp), %rax + movq %rax, 0x48(%rdi) + movq 0x128(%rsp), %rax + movq %rax, 0x50(%rdi) + movq 0x130(%rsp), %rax + movq %rax, 0x58(%rdi) + movq 0x138(%rsp), %rax + movq %rax, 0x60(%rdi) + movq 0x140(%rsp), %rax + movq %rax, 0x68(%rdi) + movq 0x148(%rsp), %rax + movq %rax, 0x70(%rdi) + movq 0x150(%rsp), %rax + movq %rax, 0x78(%rdi) + movq 0x158(%rsp), %rax + movq %rax, 0x80(%rdi) + movq 0x160(%rsp), %rax + movq %rax, 0x88(%rdi) + movq 0x168(%rsp), %rax + movq %rax, 0x90(%rdi) + movq 0x170(%rsp), %rax + movq %rax, 0x98(%rdi) + movq 0x178(%rsp), %rax + movq %rax, 0xa0(%rdi) + movq 0x180(%rsp), %rax + movq %rax, 0xa8(%rdi) + movq 0x188(%rsp), %rax + movq %rax, 0xb0(%rdi) + movq 0x190(%rsp), %rax + movq %rax, 0xb8(%rdi) + movq 0x198(%rsp), %rax + movq %rax, 0xc0(%rdi) + movq 0x1a0(%rsp), %rax + movq %rax, 0xc8(%rdi) + movq 0x1a8(%rsp), %rax + movq %rax, 0xd0(%rdi) + addq $0x210, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p521_jscalarmul_jdouble: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x208, %rsp + movq %rdi, 0x1f8(%rsp) + movq %rsi, 0x200(%rsp) + movq 0x200(%rsp), %rdi + leaq 0x90(%rdi), %rsi + leaq (%rsp), %rdi + callq 
p521_jscalarmul_sqr_p521 + movq 0x200(%rsp), %rdi + leaq 0x48(%rdi), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq 0x200(%rsp), %rdi + stc + movq (%rdi), %rax + adcq (%rsp), %rax + movq 0x8(%rdi), %rbx + adcq 0x8(%rsp), %rbx + movq 0x10(%rdi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rdi), %r9 + adcq 0x18(%rsp), %r9 + movq 0x20(%rdi), %r10 + adcq 0x20(%rsp), %r10 + movq 0x28(%rdi), %r11 + adcq 0x28(%rsp), %r11 + movq 0x30(%rdi), %r12 + adcq 0x30(%rsp), %r12 + movq 0x38(%rdi), %r13 + adcq 0x38(%rsp), %r13 + movq 0x40(%rdi), %r14 + adcq 0x40(%rsp), %r14 + movq $0x200, %rdx + andq %r14, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rbx + movq %rbx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq %rdx, %r14 + movq %r14, 0x1a8(%rsp) + movq 0x200(%rsp), %rdi + movq (%rdi), %rax + subq (%rsp), %rax + movq 0x8(%rdi), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x10(%rdi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rdi), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x20(%rdi), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x28(%rdi), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x30(%rdi), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x38(%rdi), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x40(%rdi), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + leaq 0x120(%rsp), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + movq 0x200(%rsp), %rdi + stc + movq 0x48(%rdi), %rax + adcq 0x90(%rdi), %rax + movq 0x50(%rdi), %rbx + adcq 0x98(%rdi), %rbx + movq 0x58(%rdi), %r8 + adcq 0xa0(%rdi), %r8 + movq 0x60(%rdi), %r9 + adcq 0xa8(%rdi), %r9 + movq 0x68(%rdi), %r10 + adcq 0xb0(%rdi), %r10 + movq 0x70(%rdi), %r11 + adcq 0xb8(%rdi), %r11 + movq 0x78(%rdi), %r12 + adcq 0xc0(%rdi), %r12 + movq 0x80(%rdi), %r13 + adcq 0xc8(%rdi), %r13 + movq 0x88(%rdi), %r14 + adcq 0xd0(%rdi), %r14 + movq $0x200, %rdx + andq %r14, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rbx + movq %rbx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq %rdx, %r14 + movq %r14, 0x1a8(%rsp) + leaq 0x90(%rsp), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq 0x200(%rsp), %rdi + leaq 0x48(%rsp), %rdx + leaq (%rdi), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_mul_p521 + leaq 0x168(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq $0x9, %rdx + movq 0x1f0(%rsp), %rbx + xorq $0x1ff, %rbx + movq 0x1b0(%rsp), %rax + notq %rax + mulxq %rax, %r8, %r9 + movq 0x1b8(%rsp), %rax + notq %rax + mulxq %rax, %rax, %r10 + addq %rax, %r9 + movq 0x1c0(%rsp), %rax + notq %rax + mulxq %rax, %rax, %r11 + adcq %rax, %r10 + movq 0x1c8(%rsp), %rax + notq %rax + mulxq %rax, %rax, %r12 + adcq %rax, %r11 + movq 0x1d0(%rsp), %rax + 
notq %rax + mulxq %rax, %rax, %r13 + adcq %rax, %r12 + movq 0x1d8(%rsp), %rax + notq %rax + mulxq %rax, %rax, %r14 + adcq %rax, %r13 + movq 0x1e0(%rsp), %rax + notq %rax + mulxq %rax, %rax, %r15 + adcq %rax, %r14 + movq 0x1e8(%rsp), %rax + notq %rax + mulxq %rax, %rax, %rcx + adcq %rax, %r15 + mulxq %rbx, %rbx, %rax + adcq %rcx, %rbx + xorl %eax, %eax + movq $0xc, %rdx + mulxq 0xd8(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0xe0(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0xe8(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0xf0(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0xf8(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x100(%rsp), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x108(%rsp), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + mulxq 0x110(%rsp), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + mulxq 0x118(%rsp), %rax, %rcx + adcxq %rax, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, 0x1b0(%rsp) + adcq %rcx, %r9 + movq %r9, 0x1b8(%rsp) + adcq %rcx, %r10 + movq %r10, 0x1c0(%rsp) + adcq %rcx, %r11 + movq %r11, 0x1c8(%rsp) + adcq %rcx, %r12 + movq %r12, 0x1d0(%rsp) + adcq %rcx, %r13 + movq %r13, 0x1d8(%rsp) + adcq %rcx, %r14 + movq %r14, 0x1e0(%rsp) + adcq %rcx, %r15 + movq %r15, 0x1e8(%rsp) + adcq %rcx, %rbx + andq $0x1ff, %rbx + movq %rbx, 0x1f0(%rsp) + movq 0x120(%rsp), %rax + subq (%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x1a8(%rsp) + leaq 0x48(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_sqr_p521 + movq 0x1f8(%rsp), %rdi + movq 0x168(%rsp), %rax + subq 0x48(%rsp), %rax + movq 0x170(%rsp), %rdx + sbbq 0x50(%rsp), %rdx + movq 0x178(%rsp), %r8 + sbbq 0x58(%rsp), %r8 + movq 0x180(%rsp), %r9 + sbbq 0x60(%rsp), %r9 + movq 0x188(%rsp), %r10 + sbbq 0x68(%rsp), %r10 + movq 0x190(%rsp), %r11 + sbbq 0x70(%rsp), %r11 + movq 0x198(%rsp), %r12 + sbbq 0x78(%rsp), %r12 + movq 0x1a0(%rsp), %r13 + sbbq 0x80(%rsp), %r13 + movq 0x1a8(%rsp), %r14 + sbbq 0x88(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x90(%rdi) + sbbq $0x0, %rdx + movq %rdx, 0x98(%rdi) + sbbq $0x0, %r8 + movq %r8, 0xa0(%rdi) + sbbq $0x0, %r9 + movq %r9, 0xa8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0xb0(%rdi) + sbbq $0x0, %r11 + movq %r11, 0xb8(%rdi) + sbbq $0x0, %r12 + movq %r12, 0xc0(%rdi) + sbbq $0x0, %r13 + movq %r13, 0xc8(%rdi) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0xd0(%rdi) + leaq 0x90(%rsp), %rdx + leaq 0x1b0(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq 
p521_jscalarmul_mul_p521 + movq 0x1f8(%rsp), %rdi + movq 0x118(%rsp), %rbx + movq 0x110(%rsp), %r15 + shldq $0x2, %r15, %rbx + movq 0x108(%rsp), %r14 + shldq $0x2, %r14, %r15 + movq 0x100(%rsp), %r13 + shldq $0x2, %r13, %r14 + movq 0xf8(%rsp), %r12 + shldq $0x2, %r12, %r13 + movq 0xf0(%rsp), %r11 + shldq $0x2, %r11, %r12 + movq 0xe8(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0xe0(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0xd8(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + movq 0x1f0(%rsp), %rcx + xorq $0x1ff, %rcx + movq 0x1b0(%rsp), %rax + notq %rax + addq %rax, %r8 + movq 0x1b8(%rsp), %rax + notq %rax + adcq %rax, %r9 + movq 0x1c0(%rsp), %rax + notq %rax + adcq %rax, %r10 + movq 0x1c8(%rsp), %rax + notq %rax + adcq %rax, %r11 + movq 0x1d0(%rsp), %rax + notq %rax + adcq %rax, %r12 + movq 0x1d8(%rsp), %rax + notq %rax + adcq %rax, %r13 + movq 0x1e0(%rsp), %rax + notq %rax + adcq %rax, %r14 + movq 0x1e8(%rsp), %rax + notq %rax + adcq %rax, %r15 + adcq %rcx, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, (%rdi) + adcq %rcx, %r9 + movq %r9, 0x8(%rdi) + adcq %rcx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + adcq %rcx, %r12 + movq %r12, 0x20(%rdi) + adcq %rcx, %r13 + movq %r13, 0x28(%rdi) + adcq %rcx, %r14 + movq %r14, 0x30(%rdi) + adcq %rcx, %r15 + movq %r15, 0x38(%rdi) + adcq %rcx, %rbx + andq $0x1ff, %rbx + movq %rbx, 0x40(%rdi) + movq 0x1f8(%rsp), %rdi + movq 0x160(%rsp), %rbx + xorq $0x1ff, %rbx + movq 0x158(%rsp), %r15 + notq %r15 + shldq $0x3, %r15, %rbx + movq 0x150(%rsp), %r14 + notq %r14 + shldq $0x3, %r14, %r15 + movq 0x148(%rsp), %r13 + notq %r13 + shldq $0x3, %r13, %r14 + movq 0x140(%rsp), %r12 + notq %r12 + shldq $0x3, %r12, %r13 + movq 0x138(%rsp), %r11 + notq %r11 + shldq $0x3, %r11, %r12 + movq 0x130(%rsp), %r10 + notq %r10 + shldq $0x3, %r10, %r11 + movq 0x128(%rsp), %r9 + notq %r9 + shldq $0x3, %r9, %r10 + movq 0x120(%rsp), %r8 + notq %r8 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x168(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x170(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x178(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x180(%rsp), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x188(%rsp), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x190(%rsp), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x198(%rsp), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + mulxq 0x1a0(%rsp), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbx + mulxq 0x1a8(%rsp), %rax, %rcx + adcxq %rax, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, 0x48(%rdi) + adcq %rcx, %r9 + movq %r9, 0x50(%rdi) + adcq %rcx, %r10 + movq %r10, 0x58(%rdi) + adcq %rcx, %r11 + movq %r11, 0x60(%rdi) + adcq %rcx, %r12 + movq %r12, 0x68(%rdi) + adcq %rcx, %r13 + movq %r13, 0x70(%rdi) + adcq %rcx, %r14 + movq %r14, 0x78(%rdi) + adcq %rcx, %r15 + movq %r15, 0x80(%rdi) + adcq %rcx, %rbx + 
andq $0x1ff, %rbx + movq %rbx, 0x88(%rdi) + addq $0x208, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p521_jscalarmul_mul_p521: + subq $0x40, %rsp + movq %rdx, %rcx + xorl %ebp, %ebp + movq (%rcx), %rdx + mulxq (%rsi), %r8, %r9 + movq %r8, (%rsp) + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + mulxq 0x20(%rsi), %rbx, %r13 + adcq %rbx, %r12 + mulxq 0x28(%rsi), %rbx, %r14 + adcq %rbx, %r13 + mulxq 0x30(%rsi), %rbx, %r15 + adcq %rbx, %r14 + mulxq 0x38(%rsi), %rbx, %r8 + adcq %rbx, %r15 + adcq %rbp, %r8 + movq 0x8(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + movq %r9, 0x8(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + adcq %rbp, %r9 + movq 0x10(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq %r10, 0x10(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x38(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcq %rbp, %r10 + movq 0x18(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq %r11, 0x18(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x38(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + adcq %rbp, %r11 + movq 0x20(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq %r12, 0x20(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x38(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcq %rbp, %r12 + movq 0x28(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + movq %r13, 0x28(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + 
mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + adcq %rbp, %r13 + movq 0x30(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + movq %r14, 0x30(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x38(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + movq 0x38(%rcx), %rdx + xorl %ebp, %ebp + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %r8 + movq %r15, 0x38(%rsp) + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x38(%rsi), %rax, %r15 + adcxq %rax, %r14 + adoxq %rbp, %r15 + adcq %rbp, %r15 + movq 0x40(%rsi), %rdx + xorl %ebp, %ebp + mulxq (%rcx), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x8(%rcx), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rcx), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rcx), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rcx), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rcx), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rcx), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rcx), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbp, %rbx + adcq %rbx, %rbp + movq 0x40(%rcx), %rdx + xorl %eax, %eax + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r8 + adoxq %rbx, %r9 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq 0x30(%rsi), %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + mulxq 0x38(%rsi), %rax, %rbx + adcxq %rax, %r15 + adoxq %rbx, %rbp + mulxq 0x40(%rsi), %rax, %rbx + adcq %rax, %rbp + movq %r8, %rax + andq $0x1ff, %rax + shrdq $0x9, %r9, %r8 + shrdq $0x9, %r10, %r9 + shrdq $0x9, %r11, %r10 + shrdq $0x9, %r12, %r11 + shrdq $0x9, %r13, %r12 + shrdq $0x9, %r14, %r13 + shrdq $0x9, %r15, %r14 + shrdq $0x9, %rbp, %r15 + shrq $0x9, %rbp + addq %rax, %rbp + stc + adcq (%rsp), %r8 + adcq 0x8(%rsp), %r9 + adcq 0x10(%rsp), %r10 + adcq 0x18(%rsp), %r11 + adcq 0x20(%rsp), %r12 + adcq 0x28(%rsp), %r13 + adcq 0x30(%rsp), %r14 + adcq 0x38(%rsp), %r15 + adcq $0xfffffffffffffe00, %rbp + cmc + sbbq $0x0, %r8 + movq %r8, (%rdi) + sbbq $0x0, %r9 + movq %r9, 0x8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0x10(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x18(%rdi) + sbbq $0x0, %r12 + movq %r12, 0x20(%rdi) + sbbq $0x0, %r13 + movq %r13, 0x28(%rdi) + sbbq $0x0, %r14 + movq %r14, 0x30(%rdi) + sbbq $0x0, %r15 + movq %r15, 0x38(%rdi) + sbbq 
$0x0, %rbp + andq $0x1ff, %rbp + movq %rbp, 0x40(%rdi) + addq $0x40, %rsp + ret + +p521_jscalarmul_sqr_p521: + subq $0x40, %rsp + xorl %ebp, %ebp + movq (%rsi), %rdx + mulxq 0x8(%rsi), %r9, %rax + movq %r9, 0x8(%rsp) + mulxq 0x10(%rsi), %r10, %rcx + adcxq %rax, %r10 + movq %r10, 0x10(%rsp) + mulxq 0x18(%rsi), %r11, %rax + adcxq %rcx, %r11 + mulxq 0x20(%rsi), %r12, %rcx + adcxq %rax, %r12 + mulxq 0x28(%rsi), %r13, %rax + adcxq %rcx, %r13 + mulxq 0x30(%rsi), %r14, %rcx + adcxq %rax, %r14 + mulxq 0x38(%rsi), %r15, %r8 + adcxq %rcx, %r15 + adcxq %rbp, %r8 + xorl %ebp, %ebp + movq 0x8(%rsi), %rdx + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + movq %r11, 0x18(%rsp) + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + movq %r12, 0x20(%rsp) + mulxq 0x20(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x28(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + mulxq 0x30(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %r8 + mulxq 0x38(%rsi), %rax, %r9 + adcxq %rax, %r8 + adoxq %rbp, %r9 + movq 0x20(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %r10 + adcxq %rax, %r9 + adoxq %rbp, %r10 + adcxq %rbp, %r10 + xorl %ebp, %ebp + movq 0x10(%rsi), %rdx + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + movq %r13, 0x28(%rsp) + mulxq 0x20(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + movq %r14, 0x30(%rsp) + mulxq 0x28(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %r8 + mulxq 0x30(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x38(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + movq 0x30(%rsi), %rdx + mulxq 0x20(%rsi), %rax, %r11 + adcxq %rax, %r10 + adoxq %rbp, %r11 + mulxq 0x28(%rsi), %rax, %r12 + adcxq %rax, %r11 + adoxq %rbp, %r12 + adcxq %rbp, %r12 + xorl %ebp, %ebp + movq 0x18(%rsi), %rdx + mulxq 0x20(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %r8 + movq %r15, 0x38(%rsp) + mulxq 0x28(%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x30(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x38(%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + movq 0x38(%rsi), %rdx + mulxq 0x20(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x28(%rsi), %rax, %r13 + adcxq %rax, %r12 + adoxq %rbp, %r13 + mulxq 0x30(%rsi), %rax, %r14 + adcxq %rax, %r13 + adoxq %rbp, %r14 + adcxq %rbp, %r14 + xorl %ebp, %ebp + movq (%rsi), %rdx + mulxq %rdx, %rax, %rcx + movq %rax, (%rsp) + movq 0x8(%rsp), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 0x8(%rsp) + movq 0x10(%rsp), %rax + movq 0x8(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 0x10(%rsp) + movq 0x18(%rsp), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 0x18(%rsp) + movq 0x20(%rsp), %rax + movq 0x10(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 0x20(%rsp) + movq 0x28(%rsp), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 0x28(%rsp) + movq 0x30(%rsp), %rax + movq 0x18(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %rax, %rax + adoxq %rdx, %rax + movq %rax, 0x30(%rsp) + movq 0x38(%rsp), %rax + adcxq %rax, %rax + adoxq %rcx, %rax + movq %rax, 0x38(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r8, %r8 + adoxq %rdx, %r8 + adcxq %r9, %r9 + adoxq %rcx, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r10, %r10 + adoxq %rdx, %r10 + adcxq %r11, %r11 + adoxq %rcx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rdx, %rcx + adcxq %r12, %r12 + adoxq %rdx, %r12 + adcxq %r13, %r13 + adoxq 
%rcx, %r13 + movq 0x38(%rsi), %rdx + mulxq %rdx, %rdx, %r15 + adcxq %r14, %r14 + adoxq %rdx, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + movq 0x40(%rsi), %rdx + movq %rdx, %rbp + imulq %rbp, %rbp + addq %rdx, %rdx + mulxq (%rsi), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x8(%rsi), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x10(%rsi), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x18(%rsi), %rax, %rcx + adcxq %rax, %r11 + adoxq %rcx, %r12 + mulxq 0x20(%rsi), %rax, %rcx + adcxq %rax, %r12 + adoxq %rcx, %r13 + mulxq 0x28(%rsi), %rax, %rcx + adcxq %rax, %r13 + adoxq %rcx, %r14 + mulxq 0x30(%rsi), %rax, %rcx + adcxq %rax, %r14 + adoxq %rcx, %r15 + mulxq 0x38(%rsi), %rax, %rcx + adcxq %rax, %r15 + adoxq %rcx, %rbp + adcq $0x0, %rbp + movq %r8, %rax + andq $0x1ff, %rax + shrdq $0x9, %r9, %r8 + shrdq $0x9, %r10, %r9 + shrdq $0x9, %r11, %r10 + shrdq $0x9, %r12, %r11 + shrdq $0x9, %r13, %r12 + shrdq $0x9, %r14, %r13 + shrdq $0x9, %r15, %r14 + shrdq $0x9, %rbp, %r15 + shrq $0x9, %rbp + addq %rax, %rbp + stc + adcq (%rsp), %r8 + adcq 0x8(%rsp), %r9 + adcq 0x10(%rsp), %r10 + adcq 0x18(%rsp), %r11 + adcq 0x20(%rsp), %r12 + adcq 0x28(%rsp), %r13 + adcq 0x30(%rsp), %r14 + adcq 0x38(%rsp), %r15 + adcq $0xfffffffffffffe00, %rbp + cmc + sbbq $0x0, %r8 + movq %r8, (%rdi) + sbbq $0x0, %r9 + movq %r9, 0x8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0x10(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x18(%rdi) + sbbq $0x0, %r12 + movq %r12, 0x20(%rdi) + sbbq $0x0, %r13 + movq %r13, 0x28(%rdi) + sbbq $0x0, %r14 + movq %r14, 0x30(%rdi) + sbbq $0x0, %r15 + movq %r15, 0x38(%rdi) + sbbq $0x0, %rbp + andq $0x1ff, %rbp + movq %rbp, 0x40(%rdi) + addq $0x40, %rsp + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S b/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S new file mode 100644 index 0000000000..ee0fca779b --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p521/p521_jscalarmul_alt.S @@ -0,0 +1,2805 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Jacobian form scalar multiplication for P-521 +// Input scalar[9], point[27]; output res[27] +// +// extern void p521_jscalarmul_alt +// (uint64_t res[static 27], +// uint64_t scalar[static 9], +// uint64_t point[static 27]); +// +// This function is a variant of its affine point version p521_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// a triple (x,y,z) representing the affine point (x/z^2,y/z^3) when +// z is nonzero or the point at infinity (group identity) if z = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-521, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_521) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. 
+// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p521_jscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p521_jscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 72 +#define JACSIZE (3*NUMSIZE) + +// Intermediate variables on the stack. +// The table is 16 entries, each of size JACSIZE = 3 * NUMSIZE +// Uppercase syntactic variants make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (55*NUMSIZE)(%rsp) + +#define NSPACE (56*NUMSIZE) + +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I,C) \ + cmpq $I, %rdi ; \ + cmovzq TAB+JACSIZE*(I-1)+C*NUMSIZE(%rsp), %rax ; \ + cmovzq TAB+JACSIZE*(I-1)+8+C*NUMSIZE(%rsp), %rbx ; \ + cmovzq TAB+JACSIZE*(I-1)+16+C*NUMSIZE(%rsp), %rcx ; \ + cmovzq TAB+JACSIZE*(I-1)+24+C*NUMSIZE(%rsp), %rdx ; \ + cmovzq TAB+JACSIZE*(I-1)+32+C*NUMSIZE(%rsp), %r8 ; \ + cmovzq TAB+JACSIZE*(I-1)+40+C*NUMSIZE(%rsp), %r9 ; \ + cmovzq TAB+JACSIZE*(I-1)+48+C*NUMSIZE(%rsp), %r10 ; \ + cmovzq TAB+JACSIZE*(I-1)+56+C*NUMSIZE(%rsp), %r11 ; \ + cmovzq TAB+JACSIZE*(I-1)+64+C*NUMSIZE(%rsp), %r12 + +S2N_BN_SYMBOL(p521_jscalarmul_alt): + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p521_jscalarmul_alt_standard + popq %rsi + popq %rdi + ret + +p521_jscalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" input argument; others get processed early. + + movq %rdi, res + +// Reduce the input scalar mod n_521 and store it to "scalarb". + + movq %rdx, %rbx + leaq SCALARB(%rsp), %rdi + callq p521_jscalarmul_alt_bignum_mod_n521_9 + +// Set the tab[0] table entry to the input point = 1 * P, but also +// reduce all coordinates modulo p. In principle we assume reduction +// as a precondition, but this reduces the scope for surprise, e.g. +// making sure that any input with z = 0 is treated as zero, even +// if the other coordinates are not in fact reduced. + + leaq TAB(%rsp), %rdi + movq %rbx, %rsi + callq p521_jscalarmul_alt_bignum_mod_p521_9 + + leaq TAB+NUMSIZE(%rsp), %rdi + leaq NUMSIZE(%rbx), %rsi + callq p521_jscalarmul_alt_bignum_mod_p521_9 + + leaq TAB+2*NUMSIZE(%rsp), %rdi + leaq 2*NUMSIZE(%rbx), %rsi + callq p521_jscalarmul_alt_bignum_mod_p521_9 + +// If bit 520 of the scalar is set, then negate the scalar mod n_521, +// i.e. do scalar |-> n_521 - scalar, and also the point to compensate +// by negating its y coordinate. This further step is not needed by +// the indexing scheme (the top window is only a couple of bits either +// way), but is convenient to exclude a problem with the specific value +// scalar = n_521 - 18, where the last Jacobian addition is of the form +// (n_521 - 9) * P + -(9 * P) and hence is a degenerate doubling case. 
+ + xorl %eax, %eax + notq %rax + movq $0xbb6fb71e91386409, %r8 + subq SCALARB(%rsp), %r8 + movq $0x3bb5c9b8899c47ae, %r9 + sbbq SCALARB+8(%rsp), %r9 + movq $0x7fcc0148f709a5d0, %r10 + sbbq SCALARB+16(%rsp), %r10 + movq $0x51868783bf2f966b, %r11 + sbbq SCALARB+24(%rsp), %r11 + leaq -5(%rax), %r12 + sbbq SCALARB+32(%rsp), %r12 + movq %rax, %r13 + sbbq SCALARB+40(%rsp), %r13 + movq %rax, %r14 + sbbq SCALARB+48(%rsp), %r14 + movq %rax, %r15 + sbbq SCALARB+56(%rsp), %r15 + movq $0x1ff, %rax + movq SCALARB+64(%rsp), %rcx + sbbq %rcx, %rax + + btq $8, %rcx + sbbq %rcx, %rcx + + cmovncq SCALARB(%rsp), %r8 + cmovncq SCALARB+8(%rsp), %r9 + cmovncq SCALARB+16(%rsp), %r10 + cmovncq SCALARB+24(%rsp), %r11 + cmovncq SCALARB+32(%rsp), %r12 + cmovncq SCALARB+40(%rsp), %r13 + cmovncq SCALARB+48(%rsp), %r14 + cmovncq SCALARB+56(%rsp), %r15 + cmovncq SCALARB+64(%rsp), %rax + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + + movq TAB+NUMSIZE(%rsp), %r8 + movq TAB+NUMSIZE+8(%rsp), %r9 + movq TAB+NUMSIZE+16(%rsp), %r10 + movq TAB+NUMSIZE+24(%rsp), %r11 + movq TAB+NUMSIZE+32(%rsp), %r12 + movq TAB+NUMSIZE+40(%rsp), %r13 + movq TAB+NUMSIZE+48(%rsp), %r14 + movq TAB+NUMSIZE+56(%rsp), %r15 + movq TAB+NUMSIZE+64(%rsp), %rax + + movq %r8, %rbx + movq %r12, %rbp + orq %r9, %rbx + orq %r13, %rbp + orq %r10, %rbx + orq %r14, %rbp + orq %r11, %rbx + orq %r15, %rbp + orq %rbp, %rbx + orq %rax, %rbx + cmovzq %rbx, %rcx + + xorq %rcx, %r8 + xorq %rcx, %r9 + xorq %rcx, %r10 + xorq %rcx, %r11 + xorq %rcx, %r12 + xorq %rcx, %r13 + xorq %rcx, %r14 + xorq %rcx, %r15 + andq $0x1FF, %rcx + xorq %rcx, %rax + + movq %r8, TAB+NUMSIZE(%rsp) + movq %r9, TAB+NUMSIZE+8(%rsp) + movq %r10, TAB+NUMSIZE+16(%rsp) + movq %r11, TAB+NUMSIZE+24(%rsp) + movq %r12, TAB+NUMSIZE+32(%rsp) + movq %r13, TAB+NUMSIZE+40(%rsp) + movq %r14, TAB+NUMSIZE+48(%rsp) + movq %r15, TAB+NUMSIZE+56(%rsp) + movq %rax, TAB+NUMSIZE+64(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[15] = 16 * P + + leaq TAB+JACSIZE*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*2(%rsp), %rdi + leaq TAB+JACSIZE*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*3(%rsp), %rdi + leaq TAB+JACSIZE*1(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*4(%rsp), %rdi + leaq TAB+JACSIZE*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*5(%rsp), %rdi + leaq TAB+JACSIZE*2(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*6(%rsp), %rdi + leaq TAB+JACSIZE*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*7(%rsp), %rdi + leaq TAB+JACSIZE*3(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*8(%rsp), %rdi + leaq TAB+JACSIZE*7(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*9(%rsp), %rdi + leaq TAB+JACSIZE*4(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*10(%rsp), %rdi + leaq TAB+JACSIZE*9(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*11(%rsp), %rdi + leaq TAB+JACSIZE*5(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*12(%rsp), %rdi + leaq TAB+JACSIZE*11(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq 
TAB+JACSIZE*13(%rsp), %rdi + leaq TAB+JACSIZE*6(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + + leaq TAB+JACSIZE*14(%rsp), %rdi + leaq TAB+JACSIZE*13(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq p521_jscalarmul_alt_jadd + + leaq TAB+JACSIZE*15(%rsp), %rdi + leaq TAB+JACSIZE*7(%rsp), %rsi + callq p521_jscalarmul_alt_jdouble + +// Add the recoding constant sum_i(16 * 32^i) to the scalar to allow signed +// digits. The digits of the constant, in lowest-to-highest order, are as +// follows; they are generated dynamically to use fewer large constant loads. +// +// 0x0842108421084210 %rax +// 0x1084210842108421 %rbx +// 0x2108421084210842 %rbx<<1 +// 0x4210842108421084 %rbx<<2 +// 0x8421084210842108 %rbx<<3 +// 0x0842108421084210 %rax +// 0x1084210842108421 %rbx +// 0x2108421084210842 %rbx<<1 +// 0x0000000000000084 + + movq $0x1084210842108421, %rax + movq %rax, %rbx + shrq $1, %rax + movq SCALARB(%rsp), %r8 + addq %rax, %r8 + movq SCALARB+8(%rsp), %r9 + adcq %rbx, %r9 + leaq (%rbx,%rbx), %rcx + movq SCALARB+16(%rsp), %r10 + adcq %rcx, %r10 + leaq (%rcx,%rcx), %rcx + movq SCALARB+24(%rsp), %r11 + adcq %rcx, %r11 + leaq (%rcx,%rcx), %rcx + movq SCALARB+32(%rsp), %r12 + adcq %rcx, %r12 + movq SCALARB+40(%rsp), %r13 + adcq %rax, %r13 + movq SCALARB+48(%rsp), %r14 + adcq %rbx, %r14 + movq SCALARB+56(%rsp), %r15 + leaq (%rbx,%rbx), %rcx + adcq %rcx, %r15 + movq SCALARB+64(%rsp), %rax + adcq $0x84, %rax + +// Because of the initial reduction the top bitfield (>= bits 520) is <= 1, +// i.e. just a single bit. Record that in %rdi, then shift the whole +// scalar left 56 bits to align the top of the next bitfield with the MSB +// (bits 571..575). + + movq %rax, %rdi + shrq $8, %rdi + shldq $56, %r15, %rax + shldq $56, %r14, %r15 + shldq $56, %r13, %r14 + shldq $56, %r12, %r13 + shldq $56, %r11, %r12 + shldq $56, %r10, %r11 + shldq $56, %r9, %r10 + shldq $56, %r8, %r9 + shlq $56, %r8 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + +// According to the top bit, initialize the accumulator to P or 0. This top +// digit, uniquely, is not recoded so there is no sign adjustment to make. +// We only really need to adjust the z coordinate to zero, but do all three. 
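+// Commentary sketch of the selection below: with %rdi holding the top bit
+// and %rcx fixed at zero, each of the 27 accumulator words is set as
+//
+//     acc[i] = (top bit != 0) ? tab[0][i] : 0
+//
+// using testq/cmovzq word by word, so the choice involves no
+// secret-dependent branch or memory address.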
+ + xorl %ecx, %ecx + testq %rdi, %rdi + + movq TAB(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC(%rsp) + movq TAB+8(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+8(%rsp) + movq TAB+16(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+16(%rsp) + movq TAB+24(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+24(%rsp) + movq TAB+32(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+32(%rsp) + movq TAB+40(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+40(%rsp) + movq TAB+48(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+48(%rsp) + movq TAB+56(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+56(%rsp) + movq TAB+64(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+64(%rsp) + movq TAB+72(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+72(%rsp) + movq TAB+80(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+80(%rsp) + movq TAB+88(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+88(%rsp) + movq TAB+96(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+96(%rsp) + movq TAB+104(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+104(%rsp) + movq TAB+112(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+112(%rsp) + movq TAB+120(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+120(%rsp) + movq TAB+128(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+128(%rsp) + movq TAB+136(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+136(%rsp) + movq TAB+144(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+144(%rsp) + movq TAB+152(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+152(%rsp) + movq TAB+160(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+160(%rsp) + movq TAB+168(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+168(%rsp) + movq TAB+176(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+176(%rsp) + movq TAB+184(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+184(%rsp) + movq TAB+192(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+192(%rsp) + movq TAB+200(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+200(%rsp) + movq TAB+208(%rsp), %rax + cmovzq %rcx, %rax + movq %rax, ACC+208(%rsp) + +// Main loop over size-5 bitfields: double 5 times then add signed digit +// At each stage we shift the scalar left by 5 bits so we can simply pick +// the top 5 bits as the bitfield, saving some fiddle over indexing. 
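+// Rough pseudocode of the 104 iterations below (illustrative names only):
+//
+//     for 520 bits, 5 at a time:
+//         acc = 32 * acc                 // five Jacobian doublings
+//         d   = top 5 bits of scalar     // scalar then shifts left by 5
+//         e   = d - 16                   // signed digit, -16 <= e <= 15
+//         acc = acc + e * P              // via tab[|e|-1] = |e| * P with y
+//                                        // negated when e < 0; e = 0 selects
+//                                        // an all-zero entry, i.e. the
+//                                        // point at infinity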
+ + movl $520, %ebp + +p521_jscalarmul_alt_mainloop: + subq $5, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jdouble + +// Choose the bitfield and adjust it to sign and magnitude + + movq SCALARB(%rsp), %r8 + movq SCALARB+8(%rsp), %r9 + movq SCALARB+16(%rsp), %r10 + movq SCALARB+24(%rsp), %r11 + movq SCALARB+32(%rsp), %r12 + movq SCALARB+40(%rsp), %r13 + movq SCALARB+48(%rsp), %r14 + movq SCALARB+56(%rsp), %r15 + movq SCALARB+64(%rsp), %rax + + + movq %rax, %rdi + shrq $59, %rdi + + shldq $5, %r15, %rax + shldq $5, %r14, %r15 + shldq $5, %r13, %r14 + shldq $5, %r12, %r13 + shldq $5, %r11, %r12 + shldq $5, %r10, %r11 + shldq $5, %r9, %r10 + shldq $5, %r8, %r9 + shlq $5, %r8 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + movq %r12, SCALARB+32(%rsp) + movq %r13, SCALARB+40(%rsp) + movq %r14, SCALARB+48(%rsp) + movq %r15, SCALARB+56(%rsp) + movq %rax, SCALARB+64(%rsp) + + subq $16, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time +// Again, this is done in separate sweeps per coordinate, doing y last. + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,0) + selectblock(2,0) + selectblock(3,0) + selectblock(4,0) + selectblock(5,0) + selectblock(6,0) + selectblock(7,0) + selectblock(8,0) + selectblock(9,0) + selectblock(10,0) + selectblock(11,0) + selectblock(12,0) + selectblock(13,0) + selectblock(14,0) + selectblock(15,0) + selectblock(16,0) + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + movq %r12, TABENT+64(%rsp) + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,2) + selectblock(2,2) + selectblock(3,2) + selectblock(4,2) + selectblock(5,2) + selectblock(6,2) + selectblock(7,2) + selectblock(8,2) + selectblock(9,2) + selectblock(10,2) + selectblock(11,2) + selectblock(12,2) + selectblock(13,2) + selectblock(14,2) + selectblock(15,2) + selectblock(16,2) + movq %rax, TABENT+2*NUMSIZE(%rsp) + movq %rbx, TABENT+2*NUMSIZE+8(%rsp) + movq %rcx, TABENT+2*NUMSIZE+16(%rsp) + movq %rdx, TABENT+2*NUMSIZE+24(%rsp) + movq %r8, TABENT+2*NUMSIZE+32(%rsp) + movq %r9, TABENT+2*NUMSIZE+40(%rsp) + movq %r10, TABENT+2*NUMSIZE+48(%rsp) + movq %r11, TABENT+2*NUMSIZE+56(%rsp) + movq %r12, TABENT+2*NUMSIZE+64(%rsp) + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + selectblock(1,1) + selectblock(2,1) + selectblock(3,1) + selectblock(4,1) + selectblock(5,1) + selectblock(6,1) + selectblock(7,1) + selectblock(8,1) + selectblock(9,1) + selectblock(10,1) + 
selectblock(11,1) + selectblock(12,1) + selectblock(13,1) + selectblock(14,1) + selectblock(15,1) + selectblock(16,1) + +// Store it to "tabent" with the y coordinate optionally negated. +// This is done carefully to give coordinates < p_521 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, %r13 + orq %rbx, %r13 + movq %rcx, %r14 + orq %rdx, %r14 + movq %r8, %r15 + orq %r9, %r15 + movq %r10, %rdi + orq %r11, %rdi + orq %r14, %r13 + orq %rdi, %r15 + orq %r12, %r15 + orq %r15, %r13 + cmovzq %r13, %rsi + + xorq %rsi, %rax + xorq %rsi, %rbx + xorq %rsi, %rcx + xorq %rsi, %rdx + xorq %rsi, %r8 + xorq %rsi, %r9 + xorq %rsi, %r10 + xorq %rsi, %r11 + andq $0x1FF, %rsi + xorq %rsi, %r12 + + movq %rax, TABENT+NUMSIZE(%rsp) + movq %rbx, TABENT+NUMSIZE+8(%rsp) + movq %rcx, TABENT+NUMSIZE+16(%rsp) + movq %rdx, TABENT+NUMSIZE+24(%rsp) + movq %r8, TABENT+NUMSIZE+32(%rsp) + movq %r9, TABENT+NUMSIZE+40(%rsp) + movq %r10, TABENT+NUMSIZE+48(%rsp) + movq %r11, TABENT+NUMSIZE+56(%rsp) + movq %r12, TABENT+NUMSIZE+64(%rsp) + +// Add to the accumulator + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq p521_jscalarmul_alt_jadd + + testq %rbp, %rbp + jne p521_jscalarmul_alt_mainloop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + movq ACC+96(%rsp), %rax + movq %rax, 96(%rdi) + movq ACC+104(%rsp), %rax + movq %rax, 104(%rdi) + movq ACC+112(%rsp), %rax + movq %rax, 112(%rdi) + movq ACC+120(%rsp), %rax + movq %rax, 120(%rdi) + movq ACC+128(%rsp), %rax + movq %rax, 128(%rdi) + movq ACC+136(%rsp), %rax + movq %rax, 136(%rdi) + movq ACC+144(%rsp), %rax + movq %rax, 144(%rdi) + movq ACC+152(%rsp), %rax + movq %rax, 152(%rdi) + movq ACC+160(%rsp), %rax + movq %rax, 160(%rdi) + movq ACC+168(%rsp), %rax + movq %rax, 168(%rdi) + movq ACC+176(%rsp), %rax + movq %rax, 176(%rdi) + movq ACC+184(%rsp), %rax + movq %rax, 184(%rdi) + movq ACC+192(%rsp), %rax + movq %rax, 192(%rdi) + movq ACC+200(%rsp), %rax + movq %rax, 200(%rdi) + movq ACC+208(%rsp), %rax + movq %rax, 208(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +p521_jscalarmul_alt_bignum_mod_p521_9: + pushq %rbx + movq 0x40(%rsi), %rax + movl $0x1ff, %edx + andq %rax, %rdx + shrq $0x9, %rax + stc + adcq (%rsi), %rax + movq 0x8(%rsi), %rcx + adcq $0x0, %rcx + movq 0x10(%rsi), %r8 + adcq $0x0, %r8 + movq 0x18(%rsi), %r9 + adcq $0x0, %r9 + movq 0x20(%rsi), %r10 + adcq $0x0, %r10 + movq 0x28(%rsi), %r11 + adcq $0x0, %r11 + movq 0x30(%rsi), %rbx + adcq $0x0, %rbx + movq 0x38(%rsi), %rsi + adcq $0x0, %rsi + adcq $0x0, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, (%rdi) + sbbq $0x0, %rcx + movq %rcx, 0x8(%rdi) + sbbq $0x0, %r8 + movq %r8, 0x10(%rdi) + sbbq $0x0, %r9 + movq %r9, 0x18(%rdi) + sbbq 
$0x0, %r10 + movq %r10, 0x20(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x28(%rdi) + sbbq $0x0, %rbx + movq %rbx, 0x30(%rdi) + sbbq $0x0, %rsi + movq %rsi, 0x38(%rdi) + sbbq $0x0, %rdx + andq $0x1ff, %rdx + movq %rdx, 0x40(%rdi) + popq %rbx + ret + +p521_jscalarmul_alt_bignum_mod_n521_9: + movq 0x40(%rsi), %rcx + movq $0xfffffffffffffe00, %rax + orq %rcx, %rax + movq %rax, 0x40(%rdi) + shrq $0x9, %rcx + addq $0x1, %rcx + movq $0x449048e16ec79bf7, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq $0xc44a36477663b851, %rax + mulq %rcx + xorq %r10, %r10 + addq %rax, %r9 + adcq %rdx, %r10 + movq $0x8033feb708f65a2f, %rax + mulq %rcx + xorq %r11, %r11 + addq %rax, %r10 + adcq %rdx, %r11 + movq $0xae79787c40d06994, %rax + mulq %rcx + imulq $0x5, %rcx, %rcx + addq %rax, %r11 + adcq %rdx, %rcx + sbbq %rdx, %rdx + negq %rdx + xorl %eax, %eax + addq (%rsi), %r8 + movq %r8, (%rdi) + adcq 0x8(%rsi), %r9 + movq %r9, 0x8(%rdi) + adcq 0x10(%rsi), %r10 + movq %r10, 0x10(%rdi) + adcq 0x18(%rsi), %r11 + movq %r11, 0x18(%rdi) + adcq 0x20(%rsi), %rcx + movq %rcx, 0x20(%rdi) + adcq 0x28(%rsi), %rdx + movq %rdx, 0x28(%rdi) + movq 0x30(%rsi), %rdx + adcq %rax, %rdx + movq %rdx, 0x30(%rdi) + movq 0x38(%rsi), %rdx + adcq %rax, %rdx + movq %rdx, 0x38(%rdi) + movq 0x40(%rdi), %rcx + adcq %rax, %rcx + cmc + sbbq %rdx, %rdx + movq $0x449048e16ec79bf7, %r8 + andq %rdx, %r8 + movq $0xc44a36477663b851, %r9 + andq %rdx, %r9 + movq $0x8033feb708f65a2f, %r10 + andq %rdx, %r10 + movq $0xae79787c40d06994, %r11 + andq %rdx, %r11 + andq $0x5, %rdx + subq %r8, (%rdi) + sbbq %r9, 0x8(%rdi) + sbbq %r10, 0x10(%rdi) + sbbq %r11, 0x18(%rdi) + sbbq %rdx, 0x20(%rdi) + sbbq %rax, 0x28(%rdi) + sbbq %rax, 0x30(%rdi) + sbbq %rax, 0x38(%rdi) + sbbl %eax, %ecx + andl $0x1ff, %ecx + movq %rcx, 0x40(%rdi) + ret + +p521_jscalarmul_alt_jadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x210, %rsp + movq %rdi, 0x1f8(%rsp) + movq %rsi, 0x200(%rsp) + movq %rdx, 0x208(%rsp) + movq 0x200(%rsp), %rsi + leaq 0x90(%rsi), %rsi + leaq (%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x208(%rsp), %rdi + leaq 0x90(%rdi), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x200(%rsp), %rsi + movq 0x208(%rsp), %rdi + leaq 0x48(%rsi), %rdx + leaq 0x90(%rdi), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x200(%rsp), %rsi + movq 0x208(%rsp), %rdi + leaq 0x48(%rdi), %rdx + leaq 0x90(%rsi), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x208(%rsp), %rdi + leaq (%rdi), %rdx + leaq (%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x200(%rsp), %rsi + leaq (%rsi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + leaq 0x48(%rsp), %rdx + leaq (%rsp), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + leaq 0x1b0(%rsp), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x90(%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x98(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0xa0(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0xa8(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0xb0(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0xb8(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0xc0(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0xc8(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0xd0(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 
0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x1a8(%rsp) + movq 0x48(%rsp), %rax + subq 0x1b0(%rsp), %rax + movq 0x50(%rsp), %rdx + sbbq 0x1b8(%rsp), %rdx + movq 0x58(%rsp), %r8 + sbbq 0x1c0(%rsp), %r8 + movq 0x60(%rsp), %r9 + sbbq 0x1c8(%rsp), %r9 + movq 0x68(%rsp), %r10 + sbbq 0x1d0(%rsp), %r10 + movq 0x70(%rsp), %r11 + sbbq 0x1d8(%rsp), %r11 + movq 0x78(%rsp), %r12 + sbbq 0x1e0(%rsp), %r12 + movq 0x80(%rsp), %r13 + sbbq 0x1e8(%rsp), %r13 + movq 0x88(%rsp), %r14 + sbbq 0x1f0(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x48(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x50(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x58(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x60(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x68(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x70(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x78(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x80(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x88(%rsp) + leaq 0x168(%rsp), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + leaq 0x48(%rsp), %rsi + leaq (%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + leaq 0x120(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + leaq 0x90(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq (%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x8(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0x10(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0x20(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0x28(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0x30(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0x38(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0x40(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, (%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x8(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x10(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x18(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x20(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x28(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x30(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x38(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x40(%rsp) + movq 0x90(%rsp), %rax + subq 0x120(%rsp), %rax + movq 0x98(%rsp), %rdx + sbbq 0x128(%rsp), %rdx + movq 0xa0(%rsp), %r8 + sbbq 0x130(%rsp), %r8 + movq 0xa8(%rsp), %r9 + sbbq 0x138(%rsp), %r9 + movq 0xb0(%rsp), %r10 + sbbq 0x140(%rsp), %r10 + movq 0xb8(%rsp), %r11 + sbbq 0x148(%rsp), %r11 + movq 0xc0(%rsp), %r12 + sbbq 0x150(%rsp), %r12 + movq 0xc8(%rsp), %r13 + sbbq 0x158(%rsp), %r13 + movq 0xd0(%rsp), %r14 + sbbq 0x160(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0xd8(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0xe0(%rsp) + sbbq $0x0, %r8 + movq %r8, 0xe8(%rsp) + sbbq $0x0, %r9 + movq %r9, 0xf0(%rsp) + sbbq $0x0, %r10 + movq %r10, 0xf8(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x100(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x108(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x110(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x118(%rsp) + movq 0x200(%rsp), %rsi + leaq 0x90(%rsi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq (%rsp), %rax + subq 0x90(%rsp), %rax + movq 0x8(%rsp), %rdx + sbbq 0x98(%rsp), %rdx + movq 0x10(%rsp), %r8 + sbbq 0xa0(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0xa8(%rsp), %r9 + movq 0x20(%rsp), %r10 + sbbq 0xb0(%rsp), %r10 + movq 0x28(%rsp), %r11 + 
sbbq 0xb8(%rsp), %r11 + movq 0x30(%rsp), %r12 + sbbq 0xc0(%rsp), %r12 + movq 0x38(%rsp), %r13 + sbbq 0xc8(%rsp), %r13 + movq 0x40(%rsp), %r14 + sbbq 0xd0(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, (%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x8(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x10(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x18(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x20(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x28(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x30(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x38(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x40(%rsp) + movq 0x120(%rsp), %rax + subq (%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + leaq 0x1b0(%rsp), %rdx + leaq 0xd8(%rsp), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x208(%rsp), %rdi + leaq 0x90(%rdi), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + leaq 0x120(%rsp), %rdx + leaq 0x48(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x120(%rsp), %rax + subq 0xd8(%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0xe0(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0xe8(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0xf0(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0xf8(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x100(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x108(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x110(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x118(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + movq 0x200(%rsp), %rsi + movq 0x90(%rsi), %r8 + movq 0x98(%rsi), %r9 + movq 0xa0(%rsi), %r10 + movq 0xa8(%rsi), %r11 + movq 0xb0(%rsi), %r12 + movq 0xb8(%rsi), %r13 + movq 0xc0(%rsi), %r14 + movq 0xc8(%rsi), %r15 + movq 0xd0(%rsi), %rbp + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rax, %rax + movq 0x208(%rsp), %rdi + movq 0x90(%rdi), %r8 + movq 0x98(%rdi), %r9 + movq 0xa0(%rdi), %r10 + movq 0xa8(%rdi), %r11 + movq 0xb0(%rdi), %r12 + movq 0xb8(%rdi), %r13 + movq 0xc0(%rdi), %r14 + movq 0xc8(%rdi), %r15 + movq 0xd0(%rdi), %rbp + orq %r9, %r8 + orq %r11, %r10 + orq %r13, %r12 + orq %r15, %r14 + orq %r10, %r8 + orq %r14, %r12 + orq %rbp, %r8 + orq %r12, %r8 + negq %r8 + sbbq %rdx, %rdx + cmpq %rax, %rdx + movq 0x120(%rsp), %r8 + cmovbq 0x48(%rsi), %r8 + cmova 0x48(%rdi), %r8 + movq 0x128(%rsp), %r9 + cmovbq 0x50(%rsi), %r9 + cmova 0x50(%rdi), %r9 + 
movq 0x130(%rsp), %r10 + cmovbq 0x58(%rsi), %r10 + cmova 0x58(%rdi), %r10 + movq 0x138(%rsp), %r11 + cmovbq 0x60(%rsi), %r11 + cmova 0x60(%rdi), %r11 + movq 0x140(%rsp), %r12 + cmovbq 0x68(%rsi), %r12 + cmova 0x68(%rdi), %r12 + movq 0x148(%rsp), %r13 + cmovbq 0x70(%rsi), %r13 + cmova 0x70(%rdi), %r13 + movq 0x150(%rsp), %r14 + cmovbq 0x78(%rsi), %r14 + cmova 0x78(%rdi), %r14 + movq 0x158(%rsp), %r15 + cmovbq 0x80(%rsi), %r15 + cmova 0x80(%rdi), %r15 + movq 0x160(%rsp), %rbp + cmovbq 0x88(%rsi), %rbp + cmova 0x88(%rdi), %rbp + movq %r8, 0x120(%rsp) + movq %r9, 0x128(%rsp) + movq %r10, 0x130(%rsp) + movq %r11, 0x138(%rsp) + movq %r12, 0x140(%rsp) + movq %r13, 0x148(%rsp) + movq %r14, 0x150(%rsp) + movq %r15, 0x158(%rsp) + movq %rbp, 0x160(%rsp) + movq 0x168(%rsp), %r8 + cmovbq 0x90(%rsi), %r8 + cmova 0x90(%rdi), %r8 + movq 0x170(%rsp), %r9 + cmovbq 0x98(%rsi), %r9 + cmova 0x98(%rdi), %r9 + movq 0x178(%rsp), %r10 + cmovbq 0xa0(%rsi), %r10 + cmova 0xa0(%rdi), %r10 + movq 0x180(%rsp), %r11 + cmovbq 0xa8(%rsi), %r11 + cmova 0xa8(%rdi), %r11 + movq 0x188(%rsp), %r12 + cmovbq 0xb0(%rsi), %r12 + cmova 0xb0(%rdi), %r12 + movq 0x190(%rsp), %r13 + cmovbq 0xb8(%rsi), %r13 + cmova 0xb8(%rdi), %r13 + movq 0x198(%rsp), %r14 + cmovbq 0xc0(%rsi), %r14 + cmova 0xc0(%rdi), %r14 + movq 0x1a0(%rsp), %r15 + cmovbq 0xc8(%rsi), %r15 + cmova 0xc8(%rdi), %r15 + movq 0x1a8(%rsp), %rbp + cmovbq 0xd0(%rsi), %rbp + cmova 0xd0(%rdi), %rbp + movq %r8, 0x168(%rsp) + movq %r9, 0x170(%rsp) + movq %r10, 0x178(%rsp) + movq %r11, 0x180(%rsp) + movq %r12, 0x188(%rsp) + movq %r13, 0x190(%rsp) + movq %r14, 0x198(%rsp) + movq %r15, 0x1a0(%rsp) + movq %rbp, 0x1a8(%rsp) + movq (%rsp), %r8 + cmovbq (%rsi), %r8 + cmova (%rdi), %r8 + movq 0x8(%rsp), %r9 + cmovbq 0x8(%rsi), %r9 + cmova 0x8(%rdi), %r9 + movq 0x10(%rsp), %r10 + cmovbq 0x10(%rsi), %r10 + cmova 0x10(%rdi), %r10 + movq 0x18(%rsp), %r11 + cmovbq 0x18(%rsi), %r11 + cmova 0x18(%rdi), %r11 + movq 0x20(%rsp), %r12 + cmovbq 0x20(%rsi), %r12 + cmova 0x20(%rdi), %r12 + movq 0x28(%rsp), %r13 + cmovbq 0x28(%rsi), %r13 + cmova 0x28(%rdi), %r13 + movq 0x30(%rsp), %r14 + cmovbq 0x30(%rsi), %r14 + cmova 0x30(%rdi), %r14 + movq 0x38(%rsp), %r15 + cmovbq 0x38(%rsi), %r15 + cmova 0x38(%rdi), %r15 + movq 0x40(%rsp), %rbp + cmovbq 0x40(%rsi), %rbp + cmova 0x40(%rdi), %rbp + movq 0x1f8(%rsp), %rdi + movq %r8, (%rdi) + movq %r9, 0x8(%rdi) + movq %r10, 0x10(%rdi) + movq %r11, 0x18(%rdi) + movq %r12, 0x20(%rdi) + movq %r13, 0x28(%rdi) + movq %r14, 0x30(%rdi) + movq %r15, 0x38(%rdi) + movq %rbp, 0x40(%rdi) + movq 0x120(%rsp), %rax + movq %rax, 0x48(%rdi) + movq 0x128(%rsp), %rax + movq %rax, 0x50(%rdi) + movq 0x130(%rsp), %rax + movq %rax, 0x58(%rdi) + movq 0x138(%rsp), %rax + movq %rax, 0x60(%rdi) + movq 0x140(%rsp), %rax + movq %rax, 0x68(%rdi) + movq 0x148(%rsp), %rax + movq %rax, 0x70(%rdi) + movq 0x150(%rsp), %rax + movq %rax, 0x78(%rdi) + movq 0x158(%rsp), %rax + movq %rax, 0x80(%rdi) + movq 0x160(%rsp), %rax + movq %rax, 0x88(%rdi) + movq 0x168(%rsp), %rax + movq %rax, 0x90(%rdi) + movq 0x170(%rsp), %rax + movq %rax, 0x98(%rdi) + movq 0x178(%rsp), %rax + movq %rax, 0xa0(%rdi) + movq 0x180(%rsp), %rax + movq %rax, 0xa8(%rdi) + movq 0x188(%rsp), %rax + movq %rax, 0xb0(%rdi) + movq 0x190(%rsp), %rax + movq %rax, 0xb8(%rdi) + movq 0x198(%rsp), %rax + movq %rax, 0xc0(%rdi) + movq 0x1a0(%rsp), %rax + movq %rax, 0xc8(%rdi) + movq 0x1a8(%rsp), %rax + movq %rax, 0xd0(%rdi) + addq $0x210, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p521_jscalarmul_alt_jdouble: 
+ pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0x208, %rsp + movq %rdi, 0x1f8(%rsp) + movq %rsi, 0x200(%rsp) + movq 0x200(%rsp), %rdi + leaq 0x90(%rdi), %rsi + leaq (%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x200(%rsp), %rdi + leaq 0x48(%rdi), %rsi + leaq 0x48(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x200(%rsp), %rdi + stc + movq (%rdi), %rax + adcq (%rsp), %rax + movq 0x8(%rdi), %rbx + adcq 0x8(%rsp), %rbx + movq 0x10(%rdi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rdi), %r9 + adcq 0x18(%rsp), %r9 + movq 0x20(%rdi), %r10 + adcq 0x20(%rsp), %r10 + movq 0x28(%rdi), %r11 + adcq 0x28(%rsp), %r11 + movq 0x30(%rdi), %r12 + adcq 0x30(%rsp), %r12 + movq 0x38(%rdi), %r13 + adcq 0x38(%rsp), %r13 + movq 0x40(%rdi), %r14 + adcq 0x40(%rsp), %r14 + movq $0x200, %rdx + andq %r14, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rbx + movq %rbx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq %rdx, %r14 + movq %r14, 0x1a8(%rsp) + movq 0x200(%rsp), %rdi + movq (%rdi), %rax + subq (%rsp), %rax + movq 0x8(%rdi), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x10(%rdi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rdi), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x20(%rdi), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x28(%rdi), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x30(%rdi), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x38(%rdi), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x40(%rdi), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x120(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x128(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x130(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x138(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x140(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x148(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x150(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x158(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x160(%rsp) + leaq 0x120(%rsp), %rdx + leaq 0x168(%rsp), %rsi + leaq 0x90(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x200(%rsp), %rdi + stc + movq 0x48(%rdi), %rax + adcq 0x90(%rdi), %rax + movq 0x50(%rdi), %rbx + adcq 0x98(%rdi), %rbx + movq 0x58(%rdi), %r8 + adcq 0xa0(%rdi), %r8 + movq 0x60(%rdi), %r9 + adcq 0xa8(%rdi), %r9 + movq 0x68(%rdi), %r10 + adcq 0xb0(%rdi), %r10 + movq 0x70(%rdi), %r11 + adcq 0xb8(%rdi), %r11 + movq 0x78(%rdi), %r12 + adcq 0xc0(%rdi), %r12 + movq 0x80(%rdi), %r13 + adcq 0xc8(%rdi), %r13 + movq 0x88(%rdi), %r14 + adcq 0xd0(%rdi), %r14 + movq $0x200, %rdx + andq %r14, %rdx + cmpq $0x200, %rdx + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rbx + movq %rbx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq %rdx, %r14 + movq %r14, 0x1a8(%rsp) + leaq 0x90(%rsp), %rsi + leaq 0x1b0(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x200(%rsp), %rdi + leaq 0x48(%rsp), %rdx + leaq (%rdi), %rsi + leaq 0xd8(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + leaq 0x168(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq $0x9, %rcx + movq 0x1b0(%rsp), %rax + notq %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x1b8(%rsp), %rax + notq %rax + 
mulq %rcx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x1c0(%rsp), %rax + notq %rax + mulq %rcx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x1c8(%rsp), %rax + notq %rax + mulq %rcx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x1d0(%rsp), %rax + notq %rax + mulq %rcx + xorl %r13d, %r13d + addq %rax, %r12 + adcq %rdx, %r13 + movq 0x1d8(%rsp), %rax + notq %rax + mulq %rcx + xorl %r14d, %r14d + addq %rax, %r13 + adcq %rdx, %r14 + movq 0x1e0(%rsp), %rax + notq %rax + mulq %rcx + xorl %r15d, %r15d + addq %rax, %r14 + adcq %rdx, %r15 + movq 0x1e8(%rsp), %rax + notq %rax + mulq %rcx + xorl %ebx, %ebx + addq %rax, %r15 + adcq %rdx, %rbx + movq 0x1f0(%rsp), %rax + xorq $0x1ff, %rax + imulq %rcx, %rax + addq %rax, %rbx + xorl %eax, %eax + movl $0xc, %ecx + movq 0xd8(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbp, %rbp + movq 0xe0(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbp, %rbp + movq 0xe8(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbp, %rbp + movq 0xf0(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rbp, %rbp + movq 0xf8(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rbp, %rbp + movq 0x100(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rbp, %rbp + movq 0x108(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + sbbq %rbp, %rbp + movq 0x110(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r15 + adcq %rdx, %rbx + movq 0x118(%rsp), %rax + imulq %rcx, %rax + addq %rax, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, 0x1b0(%rsp) + adcq %rcx, %r9 + movq %r9, 0x1b8(%rsp) + adcq %rcx, %r10 + movq %r10, 0x1c0(%rsp) + adcq %rcx, %r11 + movq %r11, 0x1c8(%rsp) + adcq %rcx, %r12 + movq %r12, 0x1d0(%rsp) + adcq %rcx, %r13 + movq %r13, 0x1d8(%rsp) + adcq %rcx, %r14 + movq %r14, 0x1e0(%rsp) + adcq %rcx, %r15 + movq %r15, 0x1e8(%rsp) + adcq %rcx, %rbx + andq $0x1ff, %rbx + movq %rbx, 0x1f0(%rsp) + movq 0x120(%rsp), %rax + subq (%rsp), %rax + movq 0x128(%rsp), %rdx + sbbq 0x8(%rsp), %rdx + movq 0x130(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x138(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movq 0x140(%rsp), %r10 + sbbq 0x20(%rsp), %r10 + movq 0x148(%rsp), %r11 + sbbq 0x28(%rsp), %r11 + movq 0x150(%rsp), %r12 + sbbq 0x30(%rsp), %r12 + movq 0x158(%rsp), %r13 + sbbq 0x38(%rsp), %r13 + movq 0x160(%rsp), %r14 + sbbq 0x40(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x168(%rsp) + sbbq $0x0, %rdx + movq %rdx, 0x170(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x178(%rsp) + sbbq $0x0, %r9 + movq %r9, 0x180(%rsp) + sbbq $0x0, %r10 + movq %r10, 0x188(%rsp) + sbbq $0x0, %r11 + movq %r11, 0x190(%rsp) + sbbq $0x0, %r12 + movq %r12, 0x198(%rsp) + sbbq $0x0, %r13 + movq %r13, 0x1a0(%rsp) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0x1a8(%rsp) + leaq 0x48(%rsp), %rsi + leaq 0x120(%rsp), %rdi + callq p521_jscalarmul_alt_sqr_p521 + movq 0x1f8(%rsp), %rdi + movq 0x168(%rsp), %rax + subq 0x48(%rsp), %rax + movq 0x170(%rsp), %rdx + sbbq 0x50(%rsp), %rdx + movq 0x178(%rsp), %r8 + sbbq 0x58(%rsp), %r8 + movq 0x180(%rsp), %r9 + sbbq 0x60(%rsp), %r9 + movq 
0x188(%rsp), %r10 + sbbq 0x68(%rsp), %r10 + movq 0x190(%rsp), %r11 + sbbq 0x70(%rsp), %r11 + movq 0x198(%rsp), %r12 + sbbq 0x78(%rsp), %r12 + movq 0x1a0(%rsp), %r13 + sbbq 0x80(%rsp), %r13 + movq 0x1a8(%rsp), %r14 + sbbq 0x88(%rsp), %r14 + sbbq $0x0, %rax + movq %rax, 0x90(%rdi) + sbbq $0x0, %rdx + movq %rdx, 0x98(%rdi) + sbbq $0x0, %r8 + movq %r8, 0xa0(%rdi) + sbbq $0x0, %r9 + movq %r9, 0xa8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0xb0(%rdi) + sbbq $0x0, %r11 + movq %r11, 0xb8(%rdi) + sbbq $0x0, %r12 + movq %r12, 0xc0(%rdi) + sbbq $0x0, %r13 + movq %r13, 0xc8(%rdi) + sbbq $0x0, %r14 + andq $0x1ff, %r14 + movq %r14, 0xd0(%rdi) + leaq 0x90(%rsp), %rdx + leaq 0x1b0(%rsp), %rsi + leaq 0x168(%rsp), %rdi + callq p521_jscalarmul_alt_mul_p521 + movq 0x1f8(%rsp), %rdi + movq 0x118(%rsp), %rbx + movq 0x110(%rsp), %r15 + shldq $0x2, %r15, %rbx + movq 0x108(%rsp), %r14 + shldq $0x2, %r14, %r15 + movq 0x100(%rsp), %r13 + shldq $0x2, %r13, %r14 + movq 0xf8(%rsp), %r12 + shldq $0x2, %r12, %r13 + movq 0xf0(%rsp), %r11 + shldq $0x2, %r11, %r12 + movq 0xe8(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0xe0(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0xd8(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + movq 0x1f0(%rsp), %rcx + xorq $0x1ff, %rcx + movq 0x1b0(%rsp), %rax + notq %rax + addq %rax, %r8 + movq 0x1b8(%rsp), %rax + notq %rax + adcq %rax, %r9 + movq 0x1c0(%rsp), %rax + notq %rax + adcq %rax, %r10 + movq 0x1c8(%rsp), %rax + notq %rax + adcq %rax, %r11 + movq 0x1d0(%rsp), %rax + notq %rax + adcq %rax, %r12 + movq 0x1d8(%rsp), %rax + notq %rax + adcq %rax, %r13 + movq 0x1e0(%rsp), %rax + notq %rax + adcq %rax, %r14 + movq 0x1e8(%rsp), %rax + notq %rax + adcq %rax, %r15 + adcq %rcx, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, (%rdi) + adcq %rcx, %r9 + movq %r9, 0x8(%rdi) + adcq %rcx, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + adcq %rcx, %r12 + movq %r12, 0x20(%rdi) + adcq %rcx, %r13 + movq %r13, 0x28(%rdi) + adcq %rcx, %r14 + movq %r14, 0x30(%rdi) + adcq %rcx, %r15 + movq %r15, 0x38(%rdi) + adcq %rcx, %rbx + andq $0x1ff, %rbx + movq %rbx, 0x40(%rdi) + movq 0x1f8(%rsp), %rdi + movq 0x160(%rsp), %rbx + xorq $0x1ff, %rbx + movq 0x158(%rsp), %r15 + notq %r15 + shldq $0x3, %r15, %rbx + movq 0x150(%rsp), %r14 + notq %r14 + shldq $0x3, %r14, %r15 + movq 0x148(%rsp), %r13 + notq %r13 + shldq $0x3, %r13, %r14 + movq 0x140(%rsp), %r12 + notq %r12 + shldq $0x3, %r12, %r13 + movq 0x138(%rsp), %r11 + notq %r11 + shldq $0x3, %r11, %r12 + movq 0x130(%rsp), %r10 + notq %r10 + shldq $0x3, %r10, %r11 + movq 0x128(%rsp), %r9 + notq %r9 + shldq $0x3, %r9, %r10 + movq 0x120(%rsp), %r8 + notq %r8 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + movl $0x3, %ecx + movq 0x168(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbp, %rbp + movq 0x170(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbp, %rbp + movq 0x178(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbp, %rbp + movq 0x180(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rbp, %rbp + movq 0x188(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rbp, %rbp + movq 0x190(%rsp), %rax + mulq %rcx + 
subq %rbp, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rbp, %rbp + movq 0x198(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + sbbq %rbp, %rbp + movq 0x1a0(%rsp), %rax + mulq %rcx + subq %rbp, %rdx + addq %rax, %r15 + adcq %rdx, %rbx + movq 0x1a8(%rsp), %rax + imulq %rcx, %rax + addq %rax, %rbx + movq %r9, %rax + andq %r10, %rax + andq %r11, %rax + andq %r12, %rax + andq %r13, %rax + andq %r14, %rax + andq %r15, %rax + movq %rbx, %rdx + shrq $0x9, %rdx + orq $0xfffffffffffffe00, %rbx + leaq 0x1(%rdx), %rcx + addq %r8, %rcx + movl $0x0, %ecx + adcq %rcx, %rax + movq %rbx, %rax + adcq %rcx, %rax + adcq %rdx, %r8 + movq %r8, 0x48(%rdi) + adcq %rcx, %r9 + movq %r9, 0x50(%rdi) + adcq %rcx, %r10 + movq %r10, 0x58(%rdi) + adcq %rcx, %r11 + movq %r11, 0x60(%rdi) + adcq %rcx, %r12 + movq %r12, 0x68(%rdi) + adcq %rcx, %r13 + movq %r13, 0x70(%rdi) + adcq %rcx, %r14 + movq %r14, 0x78(%rdi) + adcq %rcx, %r15 + movq %r15, 0x80(%rdi) + adcq %rcx, %rbx + andq $0x1ff, %rbx + movq %rbx, 0x88(%rdi) + addq $0x208, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +p521_jscalarmul_alt_mul_p521: + subq $0x48, %rsp + movq %rdx, %rcx + movq (%rsi), %rax + mulq (%rcx) + movq %rax, (%rsp) + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x8(%rsi), %rax + mulq (%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + movq %r9, 0x8(%rsp) + xorq %r12, %r12 + movq (%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x8(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x10(%rsi), %rax + mulq (%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq %r10, 0x10(%rsp) + xorq %r13, %r13 + movq (%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x8(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x10(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x18(%rsi), %rax + mulq (%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq %r11, 0x18(%rsp) + xorq %r14, %r14 + movq (%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x8(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x10(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x18(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x20(%rsi), %rax + mulq (%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq %r12, 0x20(%rsp) + xorq %r15, %r15 + movq (%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x8(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x18(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x20(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x28(%rsi), %rax + mulq (%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq %r13, 0x28(%rsp) + xorq %r8, %r8 + movq (%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + movq 0x8(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 
0x10(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x18(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x20(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x28(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x30(%rsi), %rax + mulq (%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq %r14, 0x30(%rsp) + xorq %r9, %r9 + movq (%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq %r9, %r9 + movq 0x8(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x10(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x18(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x20(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x28(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x30(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq 0x38(%rsi), %rax + mulq (%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + adcq $0x0, %r9 + movq %r15, 0x38(%rsp) + xorq %r10, %r10 + movq (%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq %r10, %r10 + movq 0x8(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x10(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x18(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x20(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x28(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x30(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x38(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq 0x40(%rsi), %rax + mulq (%rcx) + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq %r8, 0x40(%rsp) + xorq %r11, %r11 + movq 0x8(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq %r11, %r11 + movq 0x10(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x18(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x20(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x28(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x30(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x38(%rsi), %rax + mulq 0x10(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq 0x40(%rsi), %rax + mulq 0x8(%rcx) + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + xorq %r12, %r12 + movq 0x10(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq %r12, %r12 + movq 0x18(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x20(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x28(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x30(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x38(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x40(%rsi), %rax + mulq 
0x10(%rcx) + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + xorq %r13, %r13 + movq 0x18(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq %r13, %r13 + movq 0x20(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x28(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x30(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x38(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + movq 0x40(%rsi), %rax + mulq 0x18(%rcx) + addq %rax, %r11 + adcq %rdx, %r12 + adcq $0x0, %r13 + xorq %r14, %r14 + movq 0x20(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x28(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x30(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x38(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x40(%rsi), %rax + mulq 0x20(%rcx) + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorq %r15, %r15 + movq 0x28(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x30(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x38(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + movq 0x40(%rsi), %rax + mulq 0x28(%rcx) + addq %rax, %r13 + adcq %rdx, %r14 + adcq $0x0, %r15 + xorq %r8, %r8 + movq 0x30(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + movq 0x38(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x40(%rsi), %rax + mulq 0x30(%rcx) + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x38(%rsi), %rax + mulq 0x40(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + movq 0x40(%rsi), %rax + mulq 0x38(%rcx) + addq %rax, %r15 + adcq %rdx, %r8 + movq 0x40(%rsi), %rax + imulq 0x40(%rcx), %rax + addq %r8, %rax + movq 0x40(%rsp), %r8 + movq %r8, %rdx + andq $0x1ff, %rdx + shrdq $0x9, %r9, %r8 + shrdq $0x9, %r10, %r9 + shrdq $0x9, %r11, %r10 + shrdq $0x9, %r12, %r11 + shrdq $0x9, %r13, %r12 + shrdq $0x9, %r14, %r13 + shrdq $0x9, %r15, %r14 + shrdq $0x9, %rax, %r15 + shrq $0x9, %rax + addq %rax, %rdx + stc + adcq (%rsp), %r8 + adcq 0x8(%rsp), %r9 + adcq 0x10(%rsp), %r10 + adcq 0x18(%rsp), %r11 + adcq 0x20(%rsp), %r12 + adcq 0x28(%rsp), %r13 + adcq 0x30(%rsp), %r14 + adcq 0x38(%rsp), %r15 + adcq $0xfffffffffffffe00, %rdx + cmc + sbbq $0x0, %r8 + movq %r8, (%rdi) + sbbq $0x0, %r9 + movq %r9, 0x8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0x10(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x18(%rdi) + sbbq $0x0, %r12 + movq %r12, 0x20(%rdi) + sbbq $0x0, %r13 + movq %r13, 0x28(%rdi) + sbbq $0x0, %r14 + movq %r14, 0x30(%rdi) + sbbq $0x0, %r15 + movq %r15, 0x38(%rdi) + sbbq $0x0, %rdx + andq $0x1ff, %rdx + movq %rdx, 0x40(%rdi) + addq $0x48, %rsp + ret + +p521_jscalarmul_alt_sqr_p521: + subq $0x48, %rsp + movq (%rsi), %rax + mulq %rax + movq %rax, (%rsp) + movq %rdx, %r9 + xorq %r10, %r10 + xorq %r11, %r11 + movq (%rsi), %rax + mulq 0x8(%rsi) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r11 + addq %rax, %r9 + adcq %rdx, %r10 + adcq $0x0, %r11 + movq %r9, 0x8(%rsp) + xorq %r12, %r12 + movq 0x8(%rsi), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq (%rsi), %rax + mulq 0x10(%rsi) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r12 + addq %rax, 
%r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq %r10, 0x10(%rsp) + movq (%rsi), %rax + mulq 0x18(%rsi) + xorq %r13, %r13 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x10(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r13 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r13, %r13 + addq %rbx, %r11 + adcq %rcx, %r12 + adcq $0x0, %r13 + movq %r11, 0x18(%rsp) + movq (%rsi), %rax + mulq 0x20(%rsi) + xorq %r14, %r14 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x18(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r14 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r14, %r14 + addq %rbx, %r12 + adcq %rcx, %r13 + adcq $0x0, %r14 + movq 0x10(%rsi), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq %r12, 0x20(%rsp) + movq (%rsi), %rax + mulq 0x28(%rsi) + xorq %r15, %r15 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x20(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r15 + movq 0x10(%rsi), %rax + mulq 0x18(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r15 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r15, %r15 + addq %rbx, %r13 + adcq %rcx, %r14 + adcq $0x0, %r15 + movq %r13, 0x28(%rsp) + movq (%rsi), %rax + mulq 0x30(%rsi) + xorq %r8, %r8 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x28(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r8 + movq 0x10(%rsi), %rax + mulq 0x20(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r8 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r8, %r8 + addq %rbx, %r14 + adcq %rcx, %r15 + adcq $0x0, %r8 + movq 0x18(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq %r14, 0x30(%rsp) + movq (%rsi), %rax + mulq 0x38(%rsi) + xorq %r9, %r9 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x30(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r9 + movq 0x10(%rsi), %rax + mulq 0x28(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r9 + movq 0x18(%rsi), %rax + mulq 0x20(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r9 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r9, %r9 + addq %rbx, %r15 + adcq %rcx, %r8 + adcq $0x0, %r9 + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + mulq 0x40(%rsi) + xorq %r10, %r10 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x8(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r10 + movq 0x10(%rsi), %rax + mulq 0x30(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r10 + movq 0x18(%rsi), %rax + mulq 0x28(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r10 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r10, %r10 + addq %rbx, %r8 + adcq %rcx, %r9 + adcq $0x0, %r10 + movq 0x20(%rsi), %rax + mulq %rax + addq %rax, %r8 + adcq %rdx, %r9 + adcq $0x0, %r10 + movq %r8, 0x40(%rsp) + movq 0x8(%rsi), %rax + mulq 0x40(%rsi) + xorq %r11, %r11 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x10(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r11 + movq 0x18(%rsi), %rax + mulq 0x30(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r11 + movq 0x20(%rsi), %rax + mulq 0x28(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r11 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r11, %r11 + addq %rbx, %r9 + adcq %rcx, %r10 + adcq $0x0, %r11 + movq 0x10(%rsi), %rax + mulq 0x40(%rsi) + xorq %r12, %r12 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x18(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r12 + movq 0x20(%rsi), %rax + mulq 0x30(%rsi) + addq %rax, %rbx + adcq 
%rdx, %rcx + adcq $0x0, %r12 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r12, %r12 + addq %rbx, %r10 + adcq %rcx, %r11 + adcq $0x0, %r12 + movq 0x28(%rsi), %rax + mulq %rax + addq %rax, %r10 + adcq %rdx, %r11 + adcq $0x0, %r12 + movq 0x18(%rsi), %rax + mulq 0x40(%rsi) + xorq %r13, %r13 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x20(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r13 + movq 0x28(%rsi), %rax + mulq 0x30(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r13 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r13, %r13 + addq %rbx, %r11 + adcq %rcx, %r12 + adcq $0x0, %r13 + movq 0x20(%rsi), %rax + mulq 0x40(%rsi) + xorq %r14, %r14 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x28(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r14 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r14, %r14 + addq %rbx, %r12 + adcq %rcx, %r13 + adcq $0x0, %r14 + movq 0x30(%rsi), %rax + mulq %rax + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + movq 0x28(%rsi), %rax + mulq 0x40(%rsi) + xorq %r15, %r15 + movq %rax, %rbx + movq %rdx, %rcx + movq 0x30(%rsi), %rax + mulq 0x38(%rsi) + addq %rax, %rbx + adcq %rdx, %rcx + adcq $0x0, %r15 + addq %rbx, %rbx + adcq %rcx, %rcx + adcq %r15, %r15 + addq %rbx, %r13 + adcq %rcx, %r14 + adcq $0x0, %r15 + xorq %r8, %r8 + movq 0x38(%rsi), %rax + mulq %rax + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x30(%rsi), %rax + mulq 0x40(%rsi) + addq %rax, %rax + adcq %rdx, %rdx + adcq $0x0, %r8 + addq %rax, %r14 + adcq %rdx, %r15 + adcq $0x0, %r8 + movq 0x38(%rsi), %rax + mulq 0x40(%rsi) + addq %rax, %rax + adcq %rdx, %rdx + addq %rax, %r15 + adcq %rdx, %r8 + movq 0x40(%rsi), %rax + imulq %rax, %rax + addq %r8, %rax + movq 0x40(%rsp), %r8 + movq %r8, %rdx + andq $0x1ff, %rdx + shrdq $0x9, %r9, %r8 + shrdq $0x9, %r10, %r9 + shrdq $0x9, %r11, %r10 + shrdq $0x9, %r12, %r11 + shrdq $0x9, %r13, %r12 + shrdq $0x9, %r14, %r13 + shrdq $0x9, %r15, %r14 + shrdq $0x9, %rax, %r15 + shrq $0x9, %rax + addq %rax, %rdx + stc + adcq (%rsp), %r8 + adcq 0x8(%rsp), %r9 + adcq 0x10(%rsp), %r10 + adcq 0x18(%rsp), %r11 + adcq 0x20(%rsp), %r12 + adcq 0x28(%rsp), %r13 + adcq 0x30(%rsp), %r14 + adcq 0x38(%rsp), %r15 + adcq $0xfffffffffffffe00, %rdx + cmc + sbbq $0x0, %r8 + movq %r8, (%rdi) + sbbq $0x0, %r9 + movq %r9, 0x8(%rdi) + sbbq $0x0, %r10 + movq %r10, 0x10(%rdi) + sbbq $0x0, %r11 + movq %r11, 0x18(%rdi) + sbbq $0x0, %r12 + movq %r12, 0x20(%rdi) + sbbq $0x0, %r13 + movq %r13, 0x28(%rdi) + sbbq $0x0, %r14 + movq %r14, 0x30(%rdi) + sbbq $0x0, %r15 + movq %r15, 0x38(%rdi) + sbbq $0x0, %rdx + andq $0x1ff, %rdx + movq %rdx, 0x40(%rdi) + addq $0x48, %rsp + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/util/fipstools/delocate/delocate.peg b/util/fipstools/delocate/delocate.peg index b33b00b250..a1a2b997a4 100644 --- a/util/fipstools/delocate/delocate.peg +++ b/util/fipstools/delocate/delocate.peg @@ -91,7 +91,7 @@ ARMConstantTweak <- ((([us] "xt" [xwhb]) / "lsl" / "lsr" / "ror" / "asr") (WS '# "mul vl" # multiply offset by the hardware's vector length ARMRegister <- "sp" / ([xwdqshb] [0-9] [0-9]? !(ARMRegisterBoundary)) / "xzr" / "wzr" / "NZCV" / ARMVectorRegister / SVE2PredicateRegister / ('{' WS? ARMVectorRegister WS? ([,\-] WS? ARMVectorRegister)* WS? '}' ('[' [0-9] [0-9]? ']')? ) -ARMVectorRegister <- [vz] [0-9] [0-9]? ('.' [0-9]* [bsdhq] ('[' [0-9] [0-9]? ']')? )? +ARMVectorRegister <- [vz] [0-9] [0-9]? ('.' [0-9]* [bsdhqBSDHQ] ('[' [0-9] [0-9]? ']')? 
)? SVE2PredicateRegister <- "p" [0-9] [0-9]? "/" [mMzZ] ARMRegisterBoundary <- [a-zA-Z0-9_] # Compilers only output a very limited number of expression forms. Rather than diff --git a/util/fipstools/delocate/delocate.peg.go b/util/fipstools/delocate/delocate.peg.go index fba8d272f5..fe2024612a 100644 --- a/util/fipstools/delocate/delocate.peg.go +++ b/util/fipstools/delocate/delocate.peg.go @@ -5892,7 +5892,7 @@ func (p *Asm) Init(options ...func(*Asm) error) error { position, tokenIndex = position690, tokenIndex690 return false }, - /* 45 ARMVectorRegister <- <(('v' / 'z') [0-9] [0-9]? ('.' [0-9]* ('b' / 's' / 'd' / 'h' / 'q') ('[' [0-9] [0-9]? ']')?)?)> */ + /* 45 ARMVectorRegister <- <(('v' / 'z') [0-9] [0-9]? ('.' [0-9]* ('b' / 's' / 'd' / 'h' / 'q' / 'B' / 'S' / 'D' / 'H' / 'Q') ('[' [0-9] [0-9]? ']')?)?)> */ func() bool { position750, tokenIndex750 := position, tokenIndex { @@ -5975,41 +5975,76 @@ func (p *Asm) Init(options ...func(*Asm) error) error { l764: position, tokenIndex = position760, tokenIndex760 if buffer[position] != rune('q') { + goto l765 + } + position++ + goto l760 + l765: + position, tokenIndex = position760, tokenIndex760 + if buffer[position] != rune('B') { + goto l766 + } + position++ + goto l760 + l766: + position, tokenIndex = position760, tokenIndex760 + if buffer[position] != rune('S') { + goto l767 + } + position++ + goto l760 + l767: + position, tokenIndex = position760, tokenIndex760 + if buffer[position] != rune('D') { + goto l768 + } + position++ + goto l760 + l768: + position, tokenIndex = position760, tokenIndex760 + if buffer[position] != rune('H') { + goto l769 + } + position++ + goto l760 + l769: + position, tokenIndex = position760, tokenIndex760 + if buffer[position] != rune('Q') { goto l756 } position++ } l760: { - position765, tokenIndex765 := position, tokenIndex + position770, tokenIndex770 := position, tokenIndex if buffer[position] != rune('[') { - goto l765 + goto l770 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l765 + goto l770 } position++ { - position767, tokenIndex767 := position, tokenIndex + position772, tokenIndex772 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l767 + goto l772 } position++ - goto l768 - l767: - position, tokenIndex = position767, tokenIndex767 + goto l773 + l772: + position, tokenIndex = position772, tokenIndex772 } - l768: + l773: if buffer[position] != rune(']') { - goto l765 + goto l770 } position++ - goto l766 - l765: - position, tokenIndex = position765, tokenIndex765 + goto l771 + l770: + position, tokenIndex = position770, tokenIndex770 } - l766: + l771: goto l757 l756: position, tokenIndex = position756, tokenIndex756 @@ -6024,1787 +6059,1787 @@ func (p *Asm) Init(options ...func(*Asm) error) error { }, /* 46 SVE2PredicateRegister <- <(('p' / 'P') [0-9] [0-9]? 
'/' ('m' / 'M' / 'z' / 'Z'))> */ func() bool { - position769, tokenIndex769 := position, tokenIndex + position774, tokenIndex774 := position, tokenIndex { - position770 := position + position775 := position { - position771, tokenIndex771 := position, tokenIndex + position776, tokenIndex776 := position, tokenIndex if buffer[position] != rune('p') { - goto l772 + goto l777 } position++ - goto l771 - l772: - position, tokenIndex = position771, tokenIndex771 + goto l776 + l777: + position, tokenIndex = position776, tokenIndex776 if buffer[position] != rune('P') { - goto l769 + goto l774 } position++ } - l771: + l776: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l769 + goto l774 } position++ { - position773, tokenIndex773 := position, tokenIndex + position778, tokenIndex778 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l773 + goto l778 } position++ - goto l774 - l773: - position, tokenIndex = position773, tokenIndex773 + goto l779 + l778: + position, tokenIndex = position778, tokenIndex778 } - l774: + l779: if buffer[position] != rune('/') { - goto l769 + goto l774 } position++ { - position775, tokenIndex775 := position, tokenIndex + position780, tokenIndex780 := position, tokenIndex if buffer[position] != rune('m') { - goto l776 + goto l781 } position++ - goto l775 - l776: - position, tokenIndex = position775, tokenIndex775 + goto l780 + l781: + position, tokenIndex = position780, tokenIndex780 if buffer[position] != rune('M') { - goto l777 + goto l782 } position++ - goto l775 - l777: - position, tokenIndex = position775, tokenIndex775 + goto l780 + l782: + position, tokenIndex = position780, tokenIndex780 if buffer[position] != rune('z') { - goto l778 + goto l783 } position++ - goto l775 - l778: - position, tokenIndex = position775, tokenIndex775 + goto l780 + l783: + position, tokenIndex = position780, tokenIndex780 if buffer[position] != rune('Z') { - goto l769 + goto l774 } position++ } - l775: - add(ruleSVE2PredicateRegister, position770) + l780: + add(ruleSVE2PredicateRegister, position775) } return true - l769: - position, tokenIndex = position769, tokenIndex769 + l774: + position, tokenIndex = position774, tokenIndex774 return false }, /* 47 ARMRegisterBoundary <- <([a-z] / [A-Z] / [0-9] / '_')> */ func() bool { - position779, tokenIndex779 := position, tokenIndex + position784, tokenIndex784 := position, tokenIndex { - position780 := position + position785 := position { - position781, tokenIndex781 := position, tokenIndex + position786, tokenIndex786 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l782 + goto l787 } position++ - goto l781 - l782: - position, tokenIndex = position781, tokenIndex781 + goto l786 + l787: + position, tokenIndex = position786, tokenIndex786 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l783 + goto l788 } position++ - goto l781 - l783: - position, tokenIndex = position781, tokenIndex781 + goto l786 + l788: + position, tokenIndex = position786, tokenIndex786 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l784 + goto l789 } position++ - goto l781 - l784: - position, tokenIndex = position781, tokenIndex781 + goto l786 + l789: + position, tokenIndex = position786, tokenIndex786 if buffer[position] != rune('_') { - goto l779 + goto l784 } position++ } - l781: - add(ruleARMRegisterBoundary, position780) + l786: + add(ruleARMRegisterBoundary, position785) } return true - l779: - position, tokenIndex = position779, 
tokenIndex779 + l784: + position, tokenIndex = position784, tokenIndex784 return false }, /* 48 MemoryRef <- <((SymbolRef BaseIndexScale) / SymbolRef / Low12BitsSymbolRef / (Offset* BaseIndexScale) / (SegmentRegister Offset BaseIndexScale) / (SegmentRegister BaseIndexScale) / (SegmentRegister Offset) / ARMBaseIndexScale / BaseIndexScale)> */ func() bool { - position785, tokenIndex785 := position, tokenIndex + position790, tokenIndex790 := position, tokenIndex { - position786 := position + position791 := position { - position787, tokenIndex787 := position, tokenIndex + position792, tokenIndex792 := position, tokenIndex if !_rules[ruleSymbolRef]() { - goto l788 + goto l793 } if !_rules[ruleBaseIndexScale]() { - goto l788 + goto l793 } - goto l787 - l788: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l793: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleSymbolRef]() { - goto l789 + goto l794 } - goto l787 - l789: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l794: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleLow12BitsSymbolRef]() { - goto l790 + goto l795 } - goto l787 - l790: - position, tokenIndex = position787, tokenIndex787 - l792: + goto l792 + l795: + position, tokenIndex = position792, tokenIndex792 + l797: { - position793, tokenIndex793 := position, tokenIndex + position798, tokenIndex798 := position, tokenIndex if !_rules[ruleOffset]() { - goto l793 + goto l798 } - goto l792 - l793: - position, tokenIndex = position793, tokenIndex793 + goto l797 + l798: + position, tokenIndex = position798, tokenIndex798 } if !_rules[ruleBaseIndexScale]() { - goto l791 + goto l796 } - goto l787 - l791: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l796: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleSegmentRegister]() { - goto l794 + goto l799 } if !_rules[ruleOffset]() { - goto l794 + goto l799 } if !_rules[ruleBaseIndexScale]() { - goto l794 + goto l799 } - goto l787 - l794: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l799: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleSegmentRegister]() { - goto l795 + goto l800 } if !_rules[ruleBaseIndexScale]() { - goto l795 + goto l800 } - goto l787 - l795: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l800: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleSegmentRegister]() { - goto l796 + goto l801 } if !_rules[ruleOffset]() { - goto l796 + goto l801 } - goto l787 - l796: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l801: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleARMBaseIndexScale]() { - goto l797 + goto l802 } - goto l787 - l797: - position, tokenIndex = position787, tokenIndex787 + goto l792 + l802: + position, tokenIndex = position792, tokenIndex792 if !_rules[ruleBaseIndexScale]() { - goto l785 + goto l790 } } - l787: - add(ruleMemoryRef, position786) + l792: + add(ruleMemoryRef, position791) } return true - l785: - position, tokenIndex = position785, tokenIndex785 + l790: + position, tokenIndex = position790, tokenIndex790 return false }, /* 49 SymbolRef <- <((Offset* '+')? 
(LocalSymbol / SymbolName) Offset* ('@' Section Offset*)?)> */ func() bool { - position798, tokenIndex798 := position, tokenIndex + position803, tokenIndex803 := position, tokenIndex { - position799 := position + position804 := position { - position800, tokenIndex800 := position, tokenIndex - l802: + position805, tokenIndex805 := position, tokenIndex + l807: { - position803, tokenIndex803 := position, tokenIndex + position808, tokenIndex808 := position, tokenIndex if !_rules[ruleOffset]() { - goto l803 + goto l808 } - goto l802 - l803: - position, tokenIndex = position803, tokenIndex803 + goto l807 + l808: + position, tokenIndex = position808, tokenIndex808 } if buffer[position] != rune('+') { - goto l800 + goto l805 } position++ - goto l801 - l800: - position, tokenIndex = position800, tokenIndex800 + goto l806 + l805: + position, tokenIndex = position805, tokenIndex805 } - l801: + l806: { - position804, tokenIndex804 := position, tokenIndex + position809, tokenIndex809 := position, tokenIndex if !_rules[ruleLocalSymbol]() { - goto l805 + goto l810 } - goto l804 - l805: - position, tokenIndex = position804, tokenIndex804 + goto l809 + l810: + position, tokenIndex = position809, tokenIndex809 if !_rules[ruleSymbolName]() { - goto l798 + goto l803 } } - l804: - l806: + l809: + l811: { - position807, tokenIndex807 := position, tokenIndex + position812, tokenIndex812 := position, tokenIndex if !_rules[ruleOffset]() { - goto l807 + goto l812 } - goto l806 - l807: - position, tokenIndex = position807, tokenIndex807 + goto l811 + l812: + position, tokenIndex = position812, tokenIndex812 } { - position808, tokenIndex808 := position, tokenIndex + position813, tokenIndex813 := position, tokenIndex if buffer[position] != rune('@') { - goto l808 + goto l813 } position++ if !_rules[ruleSection]() { - goto l808 + goto l813 } - l810: + l815: { - position811, tokenIndex811 := position, tokenIndex + position816, tokenIndex816 := position, tokenIndex if !_rules[ruleOffset]() { - goto l811 + goto l816 } - goto l810 - l811: - position, tokenIndex = position811, tokenIndex811 + goto l815 + l816: + position, tokenIndex = position816, tokenIndex816 } - goto l809 - l808: - position, tokenIndex = position808, tokenIndex808 + goto l814 + l813: + position, tokenIndex = position813, tokenIndex813 } - l809: - add(ruleSymbolRef, position799) + l814: + add(ruleSymbolRef, position804) } return true - l798: - position, tokenIndex = position798, tokenIndex798 + l803: + position, tokenIndex = position803, tokenIndex803 return false }, /* 50 Low12BitsSymbolRef <- <(':' ('l' / 'L') ('o' / 'O') '1' '2' ':' (LocalSymbol / SymbolName) Offset?)> */ func() bool { - position812, tokenIndex812 := position, tokenIndex + position817, tokenIndex817 := position, tokenIndex { - position813 := position + position818 := position if buffer[position] != rune(':') { - goto l812 + goto l817 } position++ { - position814, tokenIndex814 := position, tokenIndex + position819, tokenIndex819 := position, tokenIndex if buffer[position] != rune('l') { - goto l815 + goto l820 } position++ - goto l814 - l815: - position, tokenIndex = position814, tokenIndex814 + goto l819 + l820: + position, tokenIndex = position819, tokenIndex819 if buffer[position] != rune('L') { - goto l812 + goto l817 } position++ } - l814: + l819: { - position816, tokenIndex816 := position, tokenIndex + position821, tokenIndex821 := position, tokenIndex if buffer[position] != rune('o') { - goto l817 + goto l822 } position++ - goto l816 - l817: - position, tokenIndex = position816, 
tokenIndex816 + goto l821 + l822: + position, tokenIndex = position821, tokenIndex821 if buffer[position] != rune('O') { - goto l812 + goto l817 } position++ } - l816: + l821: if buffer[position] != rune('1') { - goto l812 + goto l817 } position++ if buffer[position] != rune('2') { - goto l812 + goto l817 } position++ if buffer[position] != rune(':') { - goto l812 + goto l817 } position++ { - position818, tokenIndex818 := position, tokenIndex + position823, tokenIndex823 := position, tokenIndex if !_rules[ruleLocalSymbol]() { - goto l819 + goto l824 } - goto l818 - l819: - position, tokenIndex = position818, tokenIndex818 + goto l823 + l824: + position, tokenIndex = position823, tokenIndex823 if !_rules[ruleSymbolName]() { - goto l812 + goto l817 } } - l818: + l823: { - position820, tokenIndex820 := position, tokenIndex + position825, tokenIndex825 := position, tokenIndex if !_rules[ruleOffset]() { - goto l820 + goto l825 } - goto l821 - l820: - position, tokenIndex = position820, tokenIndex820 + goto l826 + l825: + position, tokenIndex = position825, tokenIndex825 } - l821: - add(ruleLow12BitsSymbolRef, position813) + l826: + add(ruleLow12BitsSymbolRef, position818) } return true - l812: - position, tokenIndex = position812, tokenIndex812 + l817: + position, tokenIndex = position817, tokenIndex817 return false }, /* 51 ARMBaseIndexScale <- <('[' ARMRegister (',' WS? (('#'? Offset (('*' [0-9]+) / ('*' '(' [0-9]+ Operator [0-9]+ ')') / ('+' [0-9]+)*)?) / ('#'? ARMGOTLow12) / ('#'? Low12BitsSymbolRef) / ARMRegister) (',' WS? ARMConstantTweak)?)? ']' ARMPostincrement?)> */ func() bool { - position822, tokenIndex822 := position, tokenIndex + position827, tokenIndex827 := position, tokenIndex { - position823 := position + position828 := position if buffer[position] != rune('[') { - goto l822 + goto l827 } position++ if !_rules[ruleARMRegister]() { - goto l822 + goto l827 } { - position824, tokenIndex824 := position, tokenIndex + position829, tokenIndex829 := position, tokenIndex if buffer[position] != rune(',') { - goto l824 + goto l829 } position++ { - position826, tokenIndex826 := position, tokenIndex + position831, tokenIndex831 := position, tokenIndex if !_rules[ruleWS]() { - goto l826 + goto l831 } - goto l827 - l826: - position, tokenIndex = position826, tokenIndex826 + goto l832 + l831: + position, tokenIndex = position831, tokenIndex831 } - l827: + l832: { - position828, tokenIndex828 := position, tokenIndex + position833, tokenIndex833 := position, tokenIndex { - position830, tokenIndex830 := position, tokenIndex + position835, tokenIndex835 := position, tokenIndex if buffer[position] != rune('#') { - goto l830 + goto l835 } position++ - goto l831 - l830: - position, tokenIndex = position830, tokenIndex830 + goto l836 + l835: + position, tokenIndex = position835, tokenIndex835 } - l831: + l836: if !_rules[ruleOffset]() { - goto l829 + goto l834 } { - position832, tokenIndex832 := position, tokenIndex + position837, tokenIndex837 := position, tokenIndex { - position834, tokenIndex834 := position, tokenIndex + position839, tokenIndex839 := position, tokenIndex if buffer[position] != rune('*') { - goto l835 + goto l840 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l835 + goto l840 } position++ - l836: + l841: { - position837, tokenIndex837 := position, tokenIndex + position842, tokenIndex842 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l837 + goto l842 } position++ - goto l836 - l837: - position, tokenIndex = 
position837, tokenIndex837 + goto l841 + l842: + position, tokenIndex = position842, tokenIndex842 } - goto l834 - l835: - position, tokenIndex = position834, tokenIndex834 + goto l839 + l840: + position, tokenIndex = position839, tokenIndex839 if buffer[position] != rune('*') { - goto l838 + goto l843 } position++ if buffer[position] != rune('(') { - goto l838 + goto l843 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l838 + goto l843 } position++ - l839: + l844: { - position840, tokenIndex840 := position, tokenIndex + position845, tokenIndex845 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l840 + goto l845 } position++ - goto l839 - l840: - position, tokenIndex = position840, tokenIndex840 + goto l844 + l845: + position, tokenIndex = position845, tokenIndex845 } if !_rules[ruleOperator]() { - goto l838 + goto l843 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l838 + goto l843 } position++ - l841: + l846: { - position842, tokenIndex842 := position, tokenIndex + position847, tokenIndex847 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l842 + goto l847 } position++ - goto l841 - l842: - position, tokenIndex = position842, tokenIndex842 + goto l846 + l847: + position, tokenIndex = position847, tokenIndex847 } if buffer[position] != rune(')') { - goto l838 + goto l843 } position++ - goto l834 - l838: - position, tokenIndex = position834, tokenIndex834 + goto l839 l843: + position, tokenIndex = position839, tokenIndex839 + l848: { - position844, tokenIndex844 := position, tokenIndex + position849, tokenIndex849 := position, tokenIndex if buffer[position] != rune('+') { - goto l844 + goto l849 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l844 + goto l849 } position++ - l845: + l850: { - position846, tokenIndex846 := position, tokenIndex + position851, tokenIndex851 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l846 + goto l851 } position++ - goto l845 - l846: - position, tokenIndex = position846, tokenIndex846 + goto l850 + l851: + position, tokenIndex = position851, tokenIndex851 } - goto l843 - l844: - position, tokenIndex = position844, tokenIndex844 + goto l848 + l849: + position, tokenIndex = position849, tokenIndex849 } } - l834: - goto l833 + l839: + goto l838 - position, tokenIndex = position832, tokenIndex832 + position, tokenIndex = position837, tokenIndex837 } - l833: - goto l828 - l829: - position, tokenIndex = position828, tokenIndex828 + l838: + goto l833 + l834: + position, tokenIndex = position833, tokenIndex833 { - position848, tokenIndex848 := position, tokenIndex + position853, tokenIndex853 := position, tokenIndex if buffer[position] != rune('#') { - goto l848 + goto l853 } position++ - goto l849 - l848: - position, tokenIndex = position848, tokenIndex848 + goto l854 + l853: + position, tokenIndex = position853, tokenIndex853 } - l849: + l854: if !_rules[ruleARMGOTLow12]() { - goto l847 + goto l852 } - goto l828 - l847: - position, tokenIndex = position828, tokenIndex828 + goto l833 + l852: + position, tokenIndex = position833, tokenIndex833 { - position851, tokenIndex851 := position, tokenIndex + position856, tokenIndex856 := position, tokenIndex if buffer[position] != rune('#') { - goto l851 + goto l856 } position++ - goto l852 - l851: - position, tokenIndex = position851, tokenIndex851 + goto l857 + l856: + position, tokenIndex = position856, tokenIndex856 
} - l852: + l857: if !_rules[ruleLow12BitsSymbolRef]() { - goto l850 + goto l855 } - goto l828 - l850: - position, tokenIndex = position828, tokenIndex828 + goto l833 + l855: + position, tokenIndex = position833, tokenIndex833 if !_rules[ruleARMRegister]() { - goto l824 + goto l829 } } - l828: + l833: { - position853, tokenIndex853 := position, tokenIndex + position858, tokenIndex858 := position, tokenIndex if buffer[position] != rune(',') { - goto l853 + goto l858 } position++ { - position855, tokenIndex855 := position, tokenIndex + position860, tokenIndex860 := position, tokenIndex if !_rules[ruleWS]() { - goto l855 + goto l860 } - goto l856 - l855: - position, tokenIndex = position855, tokenIndex855 + goto l861 + l860: + position, tokenIndex = position860, tokenIndex860 } - l856: + l861: if !_rules[ruleARMConstantTweak]() { - goto l853 + goto l858 } - goto l854 - l853: - position, tokenIndex = position853, tokenIndex853 + goto l859 + l858: + position, tokenIndex = position858, tokenIndex858 } - l854: - goto l825 - l824: - position, tokenIndex = position824, tokenIndex824 + l859: + goto l830 + l829: + position, tokenIndex = position829, tokenIndex829 } - l825: + l830: if buffer[position] != rune(']') { - goto l822 + goto l827 } position++ { - position857, tokenIndex857 := position, tokenIndex + position862, tokenIndex862 := position, tokenIndex if !_rules[ruleARMPostincrement]() { - goto l857 + goto l862 } - goto l858 - l857: - position, tokenIndex = position857, tokenIndex857 + goto l863 + l862: + position, tokenIndex = position862, tokenIndex862 } - l858: - add(ruleARMBaseIndexScale, position823) + l863: + add(ruleARMBaseIndexScale, position828) } return true - l822: - position, tokenIndex = position822, tokenIndex822 + l827: + position, tokenIndex = position827, tokenIndex827 return false }, /* 52 ARMGOTLow12 <- <(':' ('g' / 'G') ('o' / 'O') ('t' / 'T') '_' ('l' / 'L') ('o' / 'O') '1' '2' ':' SymbolName)> */ func() bool { - position859, tokenIndex859 := position, tokenIndex + position864, tokenIndex864 := position, tokenIndex { - position860 := position + position865 := position if buffer[position] != rune(':') { - goto l859 + goto l864 } position++ { - position861, tokenIndex861 := position, tokenIndex + position866, tokenIndex866 := position, tokenIndex if buffer[position] != rune('g') { - goto l862 + goto l867 } position++ - goto l861 - l862: - position, tokenIndex = position861, tokenIndex861 + goto l866 + l867: + position, tokenIndex = position866, tokenIndex866 if buffer[position] != rune('G') { - goto l859 + goto l864 } position++ } - l861: + l866: { - position863, tokenIndex863 := position, tokenIndex + position868, tokenIndex868 := position, tokenIndex if buffer[position] != rune('o') { - goto l864 + goto l869 } position++ - goto l863 - l864: - position, tokenIndex = position863, tokenIndex863 + goto l868 + l869: + position, tokenIndex = position868, tokenIndex868 if buffer[position] != rune('O') { - goto l859 + goto l864 } position++ } - l863: + l868: { - position865, tokenIndex865 := position, tokenIndex + position870, tokenIndex870 := position, tokenIndex if buffer[position] != rune('t') { - goto l866 + goto l871 } position++ - goto l865 - l866: - position, tokenIndex = position865, tokenIndex865 + goto l870 + l871: + position, tokenIndex = position870, tokenIndex870 if buffer[position] != rune('T') { - goto l859 + goto l864 } position++ } - l865: + l870: if buffer[position] != rune('_') { - goto l859 + goto l864 } position++ { - position867, tokenIndex867 := position, 
tokenIndex + position872, tokenIndex872 := position, tokenIndex if buffer[position] != rune('l') { - goto l868 + goto l873 } position++ - goto l867 - l868: - position, tokenIndex = position867, tokenIndex867 + goto l872 + l873: + position, tokenIndex = position872, tokenIndex872 if buffer[position] != rune('L') { - goto l859 + goto l864 } position++ } - l867: + l872: { - position869, tokenIndex869 := position, tokenIndex + position874, tokenIndex874 := position, tokenIndex if buffer[position] != rune('o') { - goto l870 + goto l875 } position++ - goto l869 - l870: - position, tokenIndex = position869, tokenIndex869 + goto l874 + l875: + position, tokenIndex = position874, tokenIndex874 if buffer[position] != rune('O') { - goto l859 + goto l864 } position++ } - l869: + l874: if buffer[position] != rune('1') { - goto l859 + goto l864 } position++ if buffer[position] != rune('2') { - goto l859 + goto l864 } position++ if buffer[position] != rune(':') { - goto l859 + goto l864 } position++ if !_rules[ruleSymbolName]() { - goto l859 + goto l864 } - add(ruleARMGOTLow12, position860) + add(ruleARMGOTLow12, position865) } return true - l859: - position, tokenIndex = position859, tokenIndex859 + l864: + position, tokenIndex = position864, tokenIndex864 return false }, /* 53 ARMPostincrement <- <'!'> */ func() bool { - position871, tokenIndex871 := position, tokenIndex + position876, tokenIndex876 := position, tokenIndex { - position872 := position + position877 := position if buffer[position] != rune('!') { - goto l871 + goto l876 } position++ - add(ruleARMPostincrement, position872) + add(ruleARMPostincrement, position877) } return true - l871: - position, tokenIndex = position871, tokenIndex871 + l876: + position, tokenIndex = position876, tokenIndex876 return false }, /* 54 BaseIndexScale <- <('(' RegisterOrConstant? WS? (',' WS? RegisterOrConstant WS? (',' [0-9]+)?)? 
')')> */ func() bool { - position873, tokenIndex873 := position, tokenIndex + position878, tokenIndex878 := position, tokenIndex { - position874 := position + position879 := position if buffer[position] != rune('(') { - goto l873 + goto l878 } position++ { - position875, tokenIndex875 := position, tokenIndex + position880, tokenIndex880 := position, tokenIndex if !_rules[ruleRegisterOrConstant]() { - goto l875 + goto l880 } - goto l876 - l875: - position, tokenIndex = position875, tokenIndex875 + goto l881 + l880: + position, tokenIndex = position880, tokenIndex880 } - l876: + l881: { - position877, tokenIndex877 := position, tokenIndex + position882, tokenIndex882 := position, tokenIndex if !_rules[ruleWS]() { - goto l877 + goto l882 } - goto l878 - l877: - position, tokenIndex = position877, tokenIndex877 + goto l883 + l882: + position, tokenIndex = position882, tokenIndex882 } - l878: + l883: { - position879, tokenIndex879 := position, tokenIndex + position884, tokenIndex884 := position, tokenIndex if buffer[position] != rune(',') { - goto l879 + goto l884 } position++ { - position881, tokenIndex881 := position, tokenIndex + position886, tokenIndex886 := position, tokenIndex if !_rules[ruleWS]() { - goto l881 + goto l886 } - goto l882 - l881: - position, tokenIndex = position881, tokenIndex881 + goto l887 + l886: + position, tokenIndex = position886, tokenIndex886 } - l882: + l887: if !_rules[ruleRegisterOrConstant]() { - goto l879 + goto l884 } { - position883, tokenIndex883 := position, tokenIndex + position888, tokenIndex888 := position, tokenIndex if !_rules[ruleWS]() { - goto l883 + goto l888 } - goto l884 - l883: - position, tokenIndex = position883, tokenIndex883 + goto l889 + l888: + position, tokenIndex = position888, tokenIndex888 } - l884: + l889: { - position885, tokenIndex885 := position, tokenIndex + position890, tokenIndex890 := position, tokenIndex if buffer[position] != rune(',') { - goto l885 + goto l890 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l885 + goto l890 } position++ - l887: + l892: { - position888, tokenIndex888 := position, tokenIndex + position893, tokenIndex893 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l888 + goto l893 } position++ - goto l887 - l888: - position, tokenIndex = position888, tokenIndex888 + goto l892 + l893: + position, tokenIndex = position893, tokenIndex893 } - goto l886 - l885: - position, tokenIndex = position885, tokenIndex885 + goto l891 + l890: + position, tokenIndex = position890, tokenIndex890 } - l886: - goto l880 - l879: - position, tokenIndex = position879, tokenIndex879 + l891: + goto l885 + l884: + position, tokenIndex = position884, tokenIndex884 } - l880: + l885: if buffer[position] != rune(')') { - goto l873 + goto l878 } position++ - add(ruleBaseIndexScale, position874) + add(ruleBaseIndexScale, position879) } return true - l873: - position, tokenIndex = position873, tokenIndex873 + l878: + position, tokenIndex = position878, tokenIndex878 return false }, /* 55 Operator <- <('+' / '-')> */ func() bool { - position889, tokenIndex889 := position, tokenIndex + position894, tokenIndex894 := position, tokenIndex { - position890 := position + position895 := position { - position891, tokenIndex891 := position, tokenIndex + position896, tokenIndex896 := position, tokenIndex if buffer[position] != rune('+') { - goto l892 + goto l897 } position++ - goto l891 - l892: - position, tokenIndex = position891, tokenIndex891 + goto l896 + l897: + position, 
tokenIndex = position896, tokenIndex896 if buffer[position] != rune('-') { - goto l889 + goto l894 } position++ } - l891: - add(ruleOperator, position890) + l896: + add(ruleOperator, position895) } return true - l889: - position, tokenIndex = position889, tokenIndex889 + l894: + position, tokenIndex = position894, tokenIndex894 return false }, /* 56 OffsetOperator <- <('+' / '-' / '*')> */ func() bool { - position893, tokenIndex893 := position, tokenIndex + position898, tokenIndex898 := position, tokenIndex { - position894 := position + position899 := position { - position895, tokenIndex895 := position, tokenIndex + position900, tokenIndex900 := position, tokenIndex if buffer[position] != rune('+') { - goto l896 + goto l901 } position++ - goto l895 - l896: - position, tokenIndex = position895, tokenIndex895 + goto l900 + l901: + position, tokenIndex = position900, tokenIndex900 if buffer[position] != rune('-') { - goto l897 + goto l902 } position++ - goto l895 - l897: - position, tokenIndex = position895, tokenIndex895 + goto l900 + l902: + position, tokenIndex = position900, tokenIndex900 if buffer[position] != rune('*') { - goto l893 + goto l898 } position++ } - l895: - add(ruleOffsetOperator, position894) + l900: + add(ruleOffsetOperator, position899) } return true - l893: - position, tokenIndex = position893, tokenIndex893 + l898: + position, tokenIndex = position898, tokenIndex898 return false }, /* 57 Offset <- <('+'? '-'? (('0' ('b' / 'B') ('0' / '1')+) / ('0' ('x' / 'X') ([0-9] / [0-9] / ([a-f] / [A-F]))+) / ((([0-9]+ WS OffsetOperator [0-9]+) / ([0-9]+ (OffsetOperator '(' [0-9]+ OffsetOperator [0-9]+ ')')?) / ([0-9]+ (OffsetOperator [0-9]+ OffsetOperator [0-9]+)?) / ([0-9]+ (OffsetOperator [0-9]+)?) / ('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ OffsetOperator [0-9]+) / ('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')' OffsetOperator [0-9]+ !'x') / ('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ ')') / ('(' [0-9]+ WS? OffsetOperator WS? [0-9]+ WS? OffsetOperator WS? 
[0-9]+ ')')) !([a-z] / [A-Z]))))> */ func() bool { - position898, tokenIndex898 := position, tokenIndex + position903, tokenIndex903 := position, tokenIndex { - position899 := position + position904 := position { - position900, tokenIndex900 := position, tokenIndex + position905, tokenIndex905 := position, tokenIndex if buffer[position] != rune('+') { - goto l900 + goto l905 } position++ - goto l901 - l900: - position, tokenIndex = position900, tokenIndex900 + goto l906 + l905: + position, tokenIndex = position905, tokenIndex905 } - l901: + l906: { - position902, tokenIndex902 := position, tokenIndex + position907, tokenIndex907 := position, tokenIndex if buffer[position] != rune('-') { - goto l902 + goto l907 } position++ - goto l903 - l902: - position, tokenIndex = position902, tokenIndex902 + goto l908 + l907: + position, tokenIndex = position907, tokenIndex907 } - l903: + l908: { - position904, tokenIndex904 := position, tokenIndex + position909, tokenIndex909 := position, tokenIndex if buffer[position] != rune('0') { - goto l905 + goto l910 } position++ { - position906, tokenIndex906 := position, tokenIndex + position911, tokenIndex911 := position, tokenIndex if buffer[position] != rune('b') { - goto l907 + goto l912 } position++ - goto l906 - l907: - position, tokenIndex = position906, tokenIndex906 + goto l911 + l912: + position, tokenIndex = position911, tokenIndex911 if buffer[position] != rune('B') { - goto l905 + goto l910 } position++ } - l906: + l911: { - position910, tokenIndex910 := position, tokenIndex + position915, tokenIndex915 := position, tokenIndex if buffer[position] != rune('0') { - goto l911 + goto l916 } position++ - goto l910 - l911: - position, tokenIndex = position910, tokenIndex910 + goto l915 + l916: + position, tokenIndex = position915, tokenIndex915 if buffer[position] != rune('1') { - goto l905 + goto l910 } position++ } - l910: - l908: + l915: + l913: { - position909, tokenIndex909 := position, tokenIndex + position914, tokenIndex914 := position, tokenIndex { - position912, tokenIndex912 := position, tokenIndex + position917, tokenIndex917 := position, tokenIndex if buffer[position] != rune('0') { - goto l913 + goto l918 } position++ - goto l912 - l913: - position, tokenIndex = position912, tokenIndex912 + goto l917 + l918: + position, tokenIndex = position917, tokenIndex917 if buffer[position] != rune('1') { - goto l909 + goto l914 } position++ } - l912: - goto l908 - l909: - position, tokenIndex = position909, tokenIndex909 + l917: + goto l913 + l914: + position, tokenIndex = position914, tokenIndex914 } - goto l904 - l905: - position, tokenIndex = position904, tokenIndex904 + goto l909 + l910: + position, tokenIndex = position909, tokenIndex909 if buffer[position] != rune('0') { - goto l914 + goto l919 } position++ { - position915, tokenIndex915 := position, tokenIndex + position920, tokenIndex920 := position, tokenIndex if buffer[position] != rune('x') { - goto l916 + goto l921 } position++ - goto l915 - l916: - position, tokenIndex = position915, tokenIndex915 + goto l920 + l921: + position, tokenIndex = position920, tokenIndex920 if buffer[position] != rune('X') { - goto l914 + goto l919 } position++ } - l915: + l920: { - position919, tokenIndex919 := position, tokenIndex + position924, tokenIndex924 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l920 + goto l925 } position++ - goto l919 - l920: - position, tokenIndex = position919, tokenIndex919 + goto l924 + l925: + position, tokenIndex = position924, 
tokenIndex924 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l921 + goto l926 } position++ - goto l919 - l921: - position, tokenIndex = position919, tokenIndex919 + goto l924 + l926: + position, tokenIndex = position924, tokenIndex924 { - position922, tokenIndex922 := position, tokenIndex + position927, tokenIndex927 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('f') { - goto l923 + goto l928 } position++ - goto l922 - l923: - position, tokenIndex = position922, tokenIndex922 + goto l927 + l928: + position, tokenIndex = position927, tokenIndex927 if c := buffer[position]; c < rune('A') || c > rune('F') { - goto l914 + goto l919 } position++ } - l922: + l927: } - l919: - l917: + l924: + l922: { - position918, tokenIndex918 := position, tokenIndex + position923, tokenIndex923 := position, tokenIndex { - position924, tokenIndex924 := position, tokenIndex + position929, tokenIndex929 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l925 + goto l930 } position++ - goto l924 - l925: - position, tokenIndex = position924, tokenIndex924 + goto l929 + l930: + position, tokenIndex = position929, tokenIndex929 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l926 + goto l931 } position++ - goto l924 - l926: - position, tokenIndex = position924, tokenIndex924 + goto l929 + l931: + position, tokenIndex = position929, tokenIndex929 { - position927, tokenIndex927 := position, tokenIndex + position932, tokenIndex932 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('f') { - goto l928 + goto l933 } position++ - goto l927 - l928: - position, tokenIndex = position927, tokenIndex927 + goto l932 + l933: + position, tokenIndex = position932, tokenIndex932 if c := buffer[position]; c < rune('A') || c > rune('F') { - goto l918 + goto l923 } position++ } - l927: + l932: } - l924: - goto l917 - l918: - position, tokenIndex = position918, tokenIndex918 + l929: + goto l922 + l923: + position, tokenIndex = position923, tokenIndex923 } - goto l904 - l914: - position, tokenIndex = position904, tokenIndex904 + goto l909 + l919: + position, tokenIndex = position909, tokenIndex909 { - position929, tokenIndex929 := position, tokenIndex + position934, tokenIndex934 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l930 + goto l935 } position++ - l931: + l936: { - position932, tokenIndex932 := position, tokenIndex + position937, tokenIndex937 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l932 + goto l937 } position++ - goto l931 - l932: - position, tokenIndex = position932, tokenIndex932 + goto l936 + l937: + position, tokenIndex = position937, tokenIndex937 } if !_rules[ruleWS]() { - goto l930 + goto l935 } if !_rules[ruleOffsetOperator]() { - goto l930 + goto l935 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l930 + goto l935 } position++ - l933: + l938: { - position934, tokenIndex934 := position, tokenIndex + position939, tokenIndex939 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l934 + goto l939 } position++ - goto l933 - l934: - position, tokenIndex = position934, tokenIndex934 + goto l938 + l939: + position, tokenIndex = position939, tokenIndex939 } - goto l929 - l930: - position, tokenIndex = position929, tokenIndex929 + goto l934 + l935: + position, tokenIndex = position934, tokenIndex934 if c := buffer[position]; c < rune('0') || c > 
rune('9') { - goto l935 + goto l940 } position++ - l936: + l941: { - position937, tokenIndex937 := position, tokenIndex + position942, tokenIndex942 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l937 + goto l942 } position++ - goto l936 - l937: - position, tokenIndex = position937, tokenIndex937 + goto l941 + l942: + position, tokenIndex = position942, tokenIndex942 } { - position938, tokenIndex938 := position, tokenIndex + position943, tokenIndex943 := position, tokenIndex if !_rules[ruleOffsetOperator]() { - goto l938 + goto l943 } if buffer[position] != rune('(') { - goto l938 + goto l943 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l938 + goto l943 } position++ - l940: + l945: { - position941, tokenIndex941 := position, tokenIndex + position946, tokenIndex946 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l941 + goto l946 } position++ - goto l940 - l941: - position, tokenIndex = position941, tokenIndex941 + goto l945 + l946: + position, tokenIndex = position946, tokenIndex946 } if !_rules[ruleOffsetOperator]() { - goto l938 + goto l943 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l938 + goto l943 } position++ - l942: + l947: { - position943, tokenIndex943 := position, tokenIndex + position948, tokenIndex948 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l943 + goto l948 } position++ - goto l942 - l943: - position, tokenIndex = position943, tokenIndex943 + goto l947 + l948: + position, tokenIndex = position948, tokenIndex948 } if buffer[position] != rune(')') { - goto l938 + goto l943 } position++ - goto l939 - l938: - position, tokenIndex = position938, tokenIndex938 + goto l944 + l943: + position, tokenIndex = position943, tokenIndex943 } - l939: - goto l929 - l935: - position, tokenIndex = position929, tokenIndex929 + l944: + goto l934 + l940: + position, tokenIndex = position934, tokenIndex934 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l944 + goto l949 } position++ - l945: + l950: { - position946, tokenIndex946 := position, tokenIndex + position951, tokenIndex951 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l946 + goto l951 } position++ - goto l945 - l946: - position, tokenIndex = position946, tokenIndex946 + goto l950 + l951: + position, tokenIndex = position951, tokenIndex951 } { - position947, tokenIndex947 := position, tokenIndex + position952, tokenIndex952 := position, tokenIndex if !_rules[ruleOffsetOperator]() { - goto l947 + goto l952 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l947 + goto l952 } position++ - l949: + l954: { - position950, tokenIndex950 := position, tokenIndex + position955, tokenIndex955 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l950 + goto l955 } position++ - goto l949 - l950: - position, tokenIndex = position950, tokenIndex950 + goto l954 + l955: + position, tokenIndex = position955, tokenIndex955 } if !_rules[ruleOffsetOperator]() { - goto l947 + goto l952 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l947 + goto l952 } position++ - l951: + l956: { - position952, tokenIndex952 := position, tokenIndex + position957, tokenIndex957 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l952 + goto l957 } position++ - goto l951 - l952: - position, tokenIndex = position952, 
tokenIndex952 + goto l956 + l957: + position, tokenIndex = position957, tokenIndex957 } - goto l948 - l947: - position, tokenIndex = position947, tokenIndex947 + goto l953 + l952: + position, tokenIndex = position952, tokenIndex952 } - l948: - goto l929 - l944: - position, tokenIndex = position929, tokenIndex929 + l953: + goto l934 + l949: + position, tokenIndex = position934, tokenIndex934 if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l953 + goto l958 } position++ - l954: + l959: { - position955, tokenIndex955 := position, tokenIndex + position960, tokenIndex960 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l955 + goto l960 } position++ - goto l954 - l955: - position, tokenIndex = position955, tokenIndex955 + goto l959 + l960: + position, tokenIndex = position960, tokenIndex960 } { - position956, tokenIndex956 := position, tokenIndex + position961, tokenIndex961 := position, tokenIndex if !_rules[ruleOffsetOperator]() { - goto l956 + goto l961 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l956 + goto l961 } position++ - l958: + l963: { - position959, tokenIndex959 := position, tokenIndex + position964, tokenIndex964 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l959 + goto l964 } position++ - goto l958 - l959: - position, tokenIndex = position959, tokenIndex959 + goto l963 + l964: + position, tokenIndex = position964, tokenIndex964 } - goto l957 - l956: - position, tokenIndex = position956, tokenIndex956 + goto l962 + l961: + position, tokenIndex = position961, tokenIndex961 } - l957: - goto l929 - l953: - position, tokenIndex = position929, tokenIndex929 + l962: + goto l934 + l958: + position, tokenIndex = position934, tokenIndex934 if buffer[position] != rune('(') { - goto l960 + goto l965 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l960 + goto l965 } position++ - l961: + l966: { - position962, tokenIndex962 := position, tokenIndex + position967, tokenIndex967 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l962 + goto l967 } position++ - goto l961 - l962: - position, tokenIndex = position962, tokenIndex962 + goto l966 + l967: + position, tokenIndex = position967, tokenIndex967 } { - position963, tokenIndex963 := position, tokenIndex + position968, tokenIndex968 := position, tokenIndex if !_rules[ruleWS]() { - goto l963 + goto l968 } - goto l964 - l963: - position, tokenIndex = position963, tokenIndex963 + goto l969 + l968: + position, tokenIndex = position968, tokenIndex968 } - l964: + l969: if !_rules[ruleOffsetOperator]() { - goto l960 + goto l965 } { - position965, tokenIndex965 := position, tokenIndex + position970, tokenIndex970 := position, tokenIndex if !_rules[ruleWS]() { - goto l965 + goto l970 } - goto l966 - l965: - position, tokenIndex = position965, tokenIndex965 + goto l971 + l970: + position, tokenIndex = position970, tokenIndex970 } - l966: + l971: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l960 + goto l965 } position++ - l967: + l972: { - position968, tokenIndex968 := position, tokenIndex + position973, tokenIndex973 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l968 + goto l973 } position++ - goto l967 - l968: - position, tokenIndex = position968, tokenIndex968 + goto l972 + l973: + position, tokenIndex = position973, tokenIndex973 } if buffer[position] != rune(')') { - goto l960 + goto l965 } 
position++ if !_rules[ruleOffsetOperator]() { - goto l960 + goto l965 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l960 + goto l965 } position++ - l969: + l974: { - position970, tokenIndex970 := position, tokenIndex + position975, tokenIndex975 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l970 + goto l975 } position++ - goto l969 - l970: - position, tokenIndex = position970, tokenIndex970 + goto l974 + l975: + position, tokenIndex = position975, tokenIndex975 } if !_rules[ruleOffsetOperator]() { - goto l960 + goto l965 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l960 + goto l965 } position++ - l971: + l976: { - position972, tokenIndex972 := position, tokenIndex + position977, tokenIndex977 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l972 + goto l977 } position++ - goto l971 - l972: - position, tokenIndex = position972, tokenIndex972 + goto l976 + l977: + position, tokenIndex = position977, tokenIndex977 } - goto l929 - l960: - position, tokenIndex = position929, tokenIndex929 + goto l934 + l965: + position, tokenIndex = position934, tokenIndex934 if buffer[position] != rune('(') { - goto l973 + goto l978 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l973 + goto l978 } position++ - l974: + l979: { - position975, tokenIndex975 := position, tokenIndex + position980, tokenIndex980 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l975 + goto l980 } position++ - goto l974 - l975: - position, tokenIndex = position975, tokenIndex975 + goto l979 + l980: + position, tokenIndex = position980, tokenIndex980 } { - position976, tokenIndex976 := position, tokenIndex + position981, tokenIndex981 := position, tokenIndex if !_rules[ruleWS]() { - goto l976 + goto l981 } - goto l977 - l976: - position, tokenIndex = position976, tokenIndex976 + goto l982 + l981: + position, tokenIndex = position981, tokenIndex981 } - l977: + l982: if !_rules[ruleOffsetOperator]() { - goto l973 + goto l978 } { - position978, tokenIndex978 := position, tokenIndex + position983, tokenIndex983 := position, tokenIndex if !_rules[ruleWS]() { - goto l978 + goto l983 } - goto l979 - l978: - position, tokenIndex = position978, tokenIndex978 + goto l984 + l983: + position, tokenIndex = position983, tokenIndex983 } - l979: + l984: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l973 + goto l978 } position++ - l980: + l985: { - position981, tokenIndex981 := position, tokenIndex + position986, tokenIndex986 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l981 + goto l986 } position++ - goto l980 - l981: - position, tokenIndex = position981, tokenIndex981 + goto l985 + l986: + position, tokenIndex = position986, tokenIndex986 } if buffer[position] != rune(')') { - goto l973 + goto l978 } position++ if !_rules[ruleOffsetOperator]() { - goto l973 + goto l978 } if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l973 + goto l978 } position++ - l982: + l987: { - position983, tokenIndex983 := position, tokenIndex + position988, tokenIndex988 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l983 + goto l988 } position++ - goto l982 - l983: - position, tokenIndex = position983, tokenIndex983 + goto l987 + l988: + position, tokenIndex = position988, tokenIndex988 } { - position984, tokenIndex984 := position, tokenIndex + 
position989, tokenIndex989 := position, tokenIndex if buffer[position] != rune('x') { - goto l984 + goto l989 } position++ - goto l973 - l984: - position, tokenIndex = position984, tokenIndex984 + goto l978 + l989: + position, tokenIndex = position989, tokenIndex989 } - goto l929 - l973: - position, tokenIndex = position929, tokenIndex929 + goto l934 + l978: + position, tokenIndex = position934, tokenIndex934 if buffer[position] != rune('(') { - goto l985 + goto l990 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l985 + goto l990 } position++ - l986: + l991: { - position987, tokenIndex987 := position, tokenIndex + position992, tokenIndex992 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l987 + goto l992 } position++ - goto l986 - l987: - position, tokenIndex = position987, tokenIndex987 + goto l991 + l992: + position, tokenIndex = position992, tokenIndex992 } { - position988, tokenIndex988 := position, tokenIndex + position993, tokenIndex993 := position, tokenIndex if !_rules[ruleWS]() { - goto l988 + goto l993 } - goto l989 - l988: - position, tokenIndex = position988, tokenIndex988 + goto l994 + l993: + position, tokenIndex = position993, tokenIndex993 } - l989: + l994: if !_rules[ruleOffsetOperator]() { - goto l985 + goto l990 } { - position990, tokenIndex990 := position, tokenIndex + position995, tokenIndex995 := position, tokenIndex if !_rules[ruleWS]() { - goto l990 + goto l995 } - goto l991 - l990: - position, tokenIndex = position990, tokenIndex990 + goto l996 + l995: + position, tokenIndex = position995, tokenIndex995 } - l991: + l996: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l985 + goto l990 } position++ - l992: + l997: { - position993, tokenIndex993 := position, tokenIndex + position998, tokenIndex998 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l993 + goto l998 } position++ - goto l992 - l993: - position, tokenIndex = position993, tokenIndex993 + goto l997 + l998: + position, tokenIndex = position998, tokenIndex998 } if buffer[position] != rune(')') { - goto l985 + goto l990 } position++ - goto l929 - l985: - position, tokenIndex = position929, tokenIndex929 + goto l934 + l990: + position, tokenIndex = position934, tokenIndex934 if buffer[position] != rune('(') { - goto l898 + goto l903 } position++ if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l898 + goto l903 } position++ - l994: + l999: { - position995, tokenIndex995 := position, tokenIndex + position1000, tokenIndex1000 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l995 + goto l1000 } position++ - goto l994 - l995: - position, tokenIndex = position995, tokenIndex995 + goto l999 + l1000: + position, tokenIndex = position1000, tokenIndex1000 } { - position996, tokenIndex996 := position, tokenIndex + position1001, tokenIndex1001 := position, tokenIndex if !_rules[ruleWS]() { - goto l996 + goto l1001 } - goto l997 - l996: - position, tokenIndex = position996, tokenIndex996 + goto l1002 + l1001: + position, tokenIndex = position1001, tokenIndex1001 } - l997: + l1002: if !_rules[ruleOffsetOperator]() { - goto l898 + goto l903 } { - position998, tokenIndex998 := position, tokenIndex + position1003, tokenIndex1003 := position, tokenIndex if !_rules[ruleWS]() { - goto l998 + goto l1003 } - goto l999 - l998: - position, tokenIndex = position998, tokenIndex998 + goto l1004 + l1003: + position, tokenIndex = position1003, 
tokenIndex1003 } - l999: + l1004: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l898 + goto l903 } position++ - l1000: + l1005: { - position1001, tokenIndex1001 := position, tokenIndex + position1006, tokenIndex1006 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l1001 + goto l1006 } position++ - goto l1000 - l1001: - position, tokenIndex = position1001, tokenIndex1001 + goto l1005 + l1006: + position, tokenIndex = position1006, tokenIndex1006 } { - position1002, tokenIndex1002 := position, tokenIndex + position1007, tokenIndex1007 := position, tokenIndex if !_rules[ruleWS]() { - goto l1002 + goto l1007 } - goto l1003 - l1002: - position, tokenIndex = position1002, tokenIndex1002 + goto l1008 + l1007: + position, tokenIndex = position1007, tokenIndex1007 } - l1003: + l1008: if !_rules[ruleOffsetOperator]() { - goto l898 + goto l903 } { - position1004, tokenIndex1004 := position, tokenIndex + position1009, tokenIndex1009 := position, tokenIndex if !_rules[ruleWS]() { - goto l1004 + goto l1009 } - goto l1005 - l1004: - position, tokenIndex = position1004, tokenIndex1004 + goto l1010 + l1009: + position, tokenIndex = position1009, tokenIndex1009 } - l1005: + l1010: if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l898 + goto l903 } position++ - l1006: + l1011: { - position1007, tokenIndex1007 := position, tokenIndex + position1012, tokenIndex1012 := position, tokenIndex if c := buffer[position]; c < rune('0') || c > rune('9') { - goto l1007 + goto l1012 } position++ - goto l1006 - l1007: - position, tokenIndex = position1007, tokenIndex1007 + goto l1011 + l1012: + position, tokenIndex = position1012, tokenIndex1012 } if buffer[position] != rune(')') { - goto l898 + goto l903 } position++ } - l929: + l934: { - position1008, tokenIndex1008 := position, tokenIndex + position1013, tokenIndex1013 := position, tokenIndex { - position1009, tokenIndex1009 := position, tokenIndex + position1014, tokenIndex1014 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l1010 + goto l1015 } position++ - goto l1009 - l1010: - position, tokenIndex = position1009, tokenIndex1009 + goto l1014 + l1015: + position, tokenIndex = position1014, tokenIndex1014 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l1008 + goto l1013 } position++ } - l1009: - goto l898 - l1008: - position, tokenIndex = position1008, tokenIndex1008 + l1014: + goto l903 + l1013: + position, tokenIndex = position1013, tokenIndex1013 } } - l904: - add(ruleOffset, position899) + l909: + add(ruleOffset, position904) } return true - l898: - position, tokenIndex = position898, tokenIndex898 + l903: + position, tokenIndex = position903, tokenIndex903 return false }, /* 58 Section <- <([a-z] / [A-Z] / '@')+> */ func() bool { - position1011, tokenIndex1011 := position, tokenIndex + position1016, tokenIndex1016 := position, tokenIndex { - position1012 := position + position1017 := position { - position1015, tokenIndex1015 := position, tokenIndex + position1020, tokenIndex1020 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l1016 + goto l1021 } position++ - goto l1015 - l1016: - position, tokenIndex = position1015, tokenIndex1015 + goto l1020 + l1021: + position, tokenIndex = position1020, tokenIndex1020 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l1017 + goto l1022 } position++ - goto l1015 - l1017: - position, tokenIndex = position1015, tokenIndex1015 + goto 
l1020 + l1022: + position, tokenIndex = position1020, tokenIndex1020 if buffer[position] != rune('@') { - goto l1011 + goto l1016 } position++ } - l1015: - l1013: + l1020: + l1018: { - position1014, tokenIndex1014 := position, tokenIndex + position1019, tokenIndex1019 := position, tokenIndex { - position1018, tokenIndex1018 := position, tokenIndex + position1023, tokenIndex1023 := position, tokenIndex if c := buffer[position]; c < rune('a') || c > rune('z') { - goto l1019 + goto l1024 } position++ - goto l1018 - l1019: - position, tokenIndex = position1018, tokenIndex1018 + goto l1023 + l1024: + position, tokenIndex = position1023, tokenIndex1023 if c := buffer[position]; c < rune('A') || c > rune('Z') { - goto l1020 + goto l1025 } position++ - goto l1018 - l1020: - position, tokenIndex = position1018, tokenIndex1018 + goto l1023 + l1025: + position, tokenIndex = position1023, tokenIndex1023 if buffer[position] != rune('@') { - goto l1014 + goto l1019 } position++ } - l1018: - goto l1013 - l1014: - position, tokenIndex = position1014, tokenIndex1014 + l1023: + goto l1018 + l1019: + position, tokenIndex = position1019, tokenIndex1019 } - add(ruleSection, position1012) + add(ruleSection, position1017) } return true - l1011: - position, tokenIndex = position1011, tokenIndex1011 + l1016: + position, tokenIndex = position1016, tokenIndex1016 return false }, /* 59 SegmentRegister <- <('%' ([c-g] / 's') ('s' ':'))> */ func() bool { - position1021, tokenIndex1021 := position, tokenIndex + position1026, tokenIndex1026 := position, tokenIndex { - position1022 := position + position1027 := position if buffer[position] != rune('%') { - goto l1021 + goto l1026 } position++ { - position1023, tokenIndex1023 := position, tokenIndex + position1028, tokenIndex1028 := position, tokenIndex if c := buffer[position]; c < rune('c') || c > rune('g') { - goto l1024 + goto l1029 } position++ - goto l1023 - l1024: - position, tokenIndex = position1023, tokenIndex1023 + goto l1028 + l1029: + position, tokenIndex = position1028, tokenIndex1028 if buffer[position] != rune('s') { - goto l1021 + goto l1026 } position++ } - l1023: + l1028: if buffer[position] != rune('s') { - goto l1021 + goto l1026 } position++ if buffer[position] != rune(':') { - goto l1021 + goto l1026 } position++ - add(ruleSegmentRegister, position1022) + add(ruleSegmentRegister, position1027) } return true - l1021: - position, tokenIndex = position1021, tokenIndex1021 + l1026: + position, tokenIndex = position1026, tokenIndex1026 return false }, }
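(Reviewer note, not part of the patch.) Two sketches that may help when reading the hunks above; every name below is illustrative and not taken from the diff.

The x86-64 squaring code earlier in this section ends with the usual fold for p_521 = 2^521 - 1: the low nine words sit on the stack, the high half is shifted down by 9 bits (the shrdq chain), the two halves are added with an extra 1 carried in (stc/adcq), and that provisional 1 is taken back with cmc/sbbq unless the sum reached 2^521. A minimal, self-contained Go sketch of the same arithmetic, assuming nothing from the patch:

package main

import (
	"fmt"
	"math/big"
)

// p521 = 2^521 - 1. reduceP521 is a hypothetical helper mirroring the
// assembly's fold: since 2^521 == 1 (mod p), x == (x mod 2^521) + (x >> 521).
var p521 = new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 521), big.NewInt(1))

func reduceP521(x *big.Int) *big.Int {
	lo := new(big.Int).And(x, p521) // low 521 bits (the words kept on the stack)
	hi := new(big.Int).Rsh(x, 521)  // high half (extracted by the shrdq-by-9 chain)
	s := new(big.Int).Add(lo, hi)
	if s.Cmp(p521) >= 0 { // one conditional subtraction, like the stc/adcq/cmc/sbbq sequence
		s.Sub(s, p521)
	}
	return s
}

func main() {
	a, _ := new(big.Int).SetString("123456789abcdef0fedcba9876543210", 16)
	sq := new(big.Int).Mul(a, a)
	fmt.Println(reduceP521(sq).Cmp(new(big.Int).Mod(sq, p521)) == 0) // true
}

The delocate.peg change (and the regenerated delocate.peg.go that makes up most of this section) only widens ARMVectorRegister so that lane-size suffixes may also be upper case, e.g. v0.2D as well as v0.2d, as used by the newly vendored s2n-bignum sources. If a regression test were wanted, something along these lines could sit next to the generated parser; it assumes the Asm type from delocate.peg.go with the usual generated Buffer/Init/Parse surface, and the instruction text is made up for illustration:

package main

import "testing"

// Hypothetical check that upper-case vector arrangements now parse.
func TestUppercaseVectorArrangement(t *testing.T) {
	for _, src := range []string{
		"\tzip1 v0.2D, v1.2D, v2.2D\n", // upper-case lane size, newly accepted
		"\tzip1 v0.2d, v1.2d, v2.2d\n", // lower-case form still accepted
	} {
		p := &Asm{Buffer: src}
		if err := p.Init(); err != nil {
			t.Fatalf("Init: %v", err)
		}
		if err := p.Parse(); err != nil {
			t.Errorf("Parse(%q): %v", src, err)
		}
	}
}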