From 319fc6a2f206b6a67508dd60b4424ce1a54bcb99 Mon Sep 17 00:00:00 2001 From: Robin Salen <30937548+Nashtare@users.noreply.github.com> Date: Mon, 22 Jan 2024 08:02:38 -0500 Subject: [PATCH] Improve SHA2 precompile (#1480) * Improve SHA2 precompile * Review * Add removed global label --- .../cpu/kernel/asm/hash/sha2/compression.asm | 9 +-- evm/src/cpu/kernel/asm/hash/sha2/main.asm | 23 +++---- .../kernel/asm/hash/sha2/message_schedule.asm | 62 +++++++++++-------- .../cpu/kernel/asm/hash/sha2/write_length.asm | 9 ++- evm/src/cpu/kernel/asm/memory/core.asm | 23 +++++++ 5 files changed, 81 insertions(+), 45 deletions(-) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm index 5e1ff1f30a..f25ff30229 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/compression.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/compression.asm @@ -4,6 +4,9 @@ // stack: num_blocks %mul_const(320) %add_const(2) + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address %endmacro global sha2_compression: @@ -24,9 +27,7 @@ global sha2_compression: // stack: i=0, message_schedule_addr, a[0]..h[0], retdest SWAP1 // stack: message_schedule_addr, i=0, a[0]..h[0], retdest - PUSH 0 - // stack: 0, message_schedule_addr, i=0, a[0]..h[0], retdest - %mload_current_general + %mload_current_general_no_offset // stack: num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest DUP1 // stack: num_blocks, num_blocks, message_schedule_addr, i=0, a[0]..h[0], retdest @@ -53,7 +54,7 @@ compression_loop: // stack: 4*i, message_schedule_addr, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest ADD // stack: message_schedule_addr + 4*i, a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest - %mload_current_general_u32 + %mload_u32 // stack: W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest PUSH sha2_constants_k // stack: sha2_constants_k, W[i], a[i], b[i], c[i], d[i], e[i], f[i], g[i], h[i], num_blocks, scratch_space_addr, message_schedule_addr, i, a[0]..h[0], retdest diff --git a/evm/src/cpu/kernel/asm/hash/sha2/main.asm b/evm/src/cpu/kernel/asm/hash/sha2/main.asm index 81ec391293..039379f39a 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/main.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/main.asm @@ -1,20 +1,22 @@ global sha2: // stack: virt, num_bytes, retdest - SWAP1 - // stack: num_bytes, virt, retdest - DUP2 - // stack: virt, num_bytes, virt, retdest - %mstore_current_general - // stack: virt, retdest + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address + // stack: addr, num_bytes, retdest + DUP1 SWAP2 + // stack: num_bytes, addr, addr, retdest + MSTORE_GENERAL + // stack: addr, retdest -// Precodition: input is in memory, starting at virt of kernel general segment, of the form +// Precondition: input is in memory, starting at addr of kernel general segment, of the form // num_bytes, x[0], x[1], ..., x[num_bytes - 1] // Postcodition: output is in memory, starting at 0, of the form // num_blocks, block0[0], ..., block0[63], block1[0], ..., blocklast[63] global sha2_pad: - // stack: virt, retdest - %mload_current_general + // stack: addr, retdest + MLOAD_GENERAL // stack: num_bytes, retdest // STEP 1: append 1 // insert 128 (= 1 << 7) at x[num_bytes+1] @@ -50,8 +52,7 @@ global sha2_pad: DUP1 // stack: num_blocks, num_blocks, retdest // STEP 5: write num_blocks to x[0] - PUSH 0 - %mstore_current_general + %mstore_current_general_no_offset // stack: num_blocks, retdest %message_schedule_addr_from_num_blocks %jump(sha2_gen_all_message_schedules) diff --git a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm index c9f542ce5f..b789a7fbb8 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/message_schedule.asm @@ -3,9 +3,12 @@ // stack: num_blocks %mul_const(64) %add_const(2) + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address %endmacro -// Precodition: stack contains address of one message block, followed by output address +// Precondition: stack contains address of one message block, followed by output address // Postcondition: 256 bytes starting at given output address contain the 64 32-bit chunks // of message schedule (in four-byte increments) gen_message_schedule_from_block: @@ -16,18 +19,17 @@ gen_message_schedule_from_block: // stack: block_addr + 32, block_addr, output_addr, retdest SWAP1 // stack: block_addr, block_addr + 32, output_addr, retdest - %mload_current_general_u256 + %mload_u256 // stack: block[0], block_addr + 32, output_addr, retdest SWAP1 // stack: block_addr + 32, block[0], output_addr, retdest - %mload_current_general_u256 + %mload_u256 // stack: block[1], block[0], output_addr, retdest SWAP2 // stack: output_addr, block[0], block[1], retdest %add_const(28) PUSH 8 // stack: counter=8, output_addr + 28, block[0], block[1], retdest - %jump(gen_message_schedule_from_block_0_loop) gen_message_schedule_from_block_0_loop: // Split the first half (256 bits) of the block into the first eight (32-bit) chunks of the message sdchedule. // stack: counter, output_addr, block[0], block[1], retdest @@ -43,7 +45,7 @@ gen_message_schedule_from_block_0_loop: // stack: block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest DUP3 // stack: output_addr, block[0] % (1 << 32), block[0] >> 32, output_addr, counter, block[1], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: block[0] >> 32, output_addr, counter, block[1], retdest SWAP1 // stack: output_addr, block[0] >> 32, counter, block[1], retdest @@ -81,7 +83,7 @@ gen_message_schedule_from_block_1_loop: // stack: block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest DUP3 // stack: output_addr, block[1] % (1 << 32), block[1] >> 32, output_addr, counter, block[0], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: block[1] >> 32, output_addr, counter, block[0], retdest SWAP1 // stack: output_addr, block[1] >> 32, counter, block[0], retdest @@ -111,39 +113,43 @@ gen_message_schedule_remaining_loop: // stack: counter, output_addr, block[0], block[1], retdest SWAP1 // stack: output_addr, counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, counter, block[0], block[1], retdest - %sub_const(8) + PUSH 8 + DUP2 + // stack: output_addr, 2*4, output_addr, counter, block[0], block[1], retdest + SUB // stack: output_addr - 2*4, output_addr, counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 2*4], output_addr, counter, block[0], block[1], retdest %sha2_sigma_1 // stack: sigma_1(x[output_addr - 2*4]), output_addr, counter, block[0], block[1], retdest SWAP1 // stack: output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(28) + PUSH 28 + DUP2 + // stack: output_addr, 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: output_addr - 7*4, output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 7*4], output_addr, sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(60) + PUSH 60 + DUP2 + // stack: output_addr, 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: output_addr - 15*4, output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 15*4], output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest %sha2_sigma_0 // stack: sigma_0(x[output_addr - 15*4]), output_addr, x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - DUP1 - // stack: output_addr, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %sub_const(64) + PUSH 64 + DUP2 + // stack: output_addr, 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest + SUB // stack: output_addr - 16*4, output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest - %mload_current_general_u32 + %mload_u32 // stack: x[output_addr - 16*4], output_addr, sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest SWAP1 // stack: output_addr, x[output_addr - 16*4], sigma_0(x[output_addr - 15*4]), x[output_addr - 7*4], sigma_1(x[output_addr - 2*4]), counter, block[0], block[1], retdest @@ -155,7 +161,7 @@ gen_message_schedule_remaining_loop: // stack: sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest DUP2 // stack: output_addr, sigma_1(x[output_addr - 2*4]) + x[output_addr - 16*4] + sigma_0(x[output_addr - 15*4]) + x[output_addr - 7*4], output_addr, counter, block[0], block[1], retdest - %mstore_current_general_u32 + %mstore_u32 // stack: output_addr, counter, block[0], block[1], retdest %add_const(4) // stack: output_addr + 4, counter, block[0], block[1], retdest @@ -178,12 +184,14 @@ global sha2_gen_all_message_schedules: // stack: output_addr, retdest DUP1 // stack: output_addr, output_addr, retdest - PUSH 0 - // stack: 0, output_addr, output_addr, retdest - %mload_current_general + %mload_current_general_no_offset // stack: num_blocks, output_addr, output_addr, retdest PUSH 1 - // stack: cur_addr = 1, counter = num_blocks, output_addr, output_addr, retdest + // stack: cur_offset = 1, counter = num_blocks, output_addr, output_addr, retdest + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address + // stack: cur_addr, counter, output_addr, output_addr, retdest gen_all_message_schedules_loop: // stack: cur_addr, counter, cur_output_addr, output_addr, retdest PUSH gen_all_message_schedules_loop_end diff --git a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm index c412a666ff..c9a0642fa2 100644 --- a/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm +++ b/evm/src/cpu/kernel/asm/hash/sha2/write_length.asm @@ -1,5 +1,8 @@ %macro sha2_write_length - // stack: last_addr, length + // stack: last_addr_offset, length + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address SWAP1 // stack: length, last_addr DUP1 @@ -8,7 +11,7 @@ // stack: length % (1 << 8), length, last_addr DUP3 // stack: last_addr, length % (1 << 8), length, last_addr - %mstore_current_general + %swap_mstore %rep 7 // For i = 0 to 6 @@ -26,7 +29,7 @@ // stack: (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 DUP3 // stack: last_addr - i - 2, (length >> (8 * (i + 1))) % (1 << 8), length >> (8 * (i + 1)), last_addr - i - 2 - %mstore_current_general + %swap_mstore %endrep %pop2 diff --git a/evm/src/cpu/kernel/asm/memory/core.asm b/evm/src/cpu/kernel/asm/memory/core.asm index da8a05fb18..dad5979f22 100644 --- a/evm/src/cpu/kernel/asm/memory/core.asm +++ b/evm/src/cpu/kernel/asm/memory/core.asm @@ -124,6 +124,16 @@ // stack: value %endmacro +// Load a single value from the kernel general memory, in the current context (not the kernel's context). +%macro mload_current_general_no_offset + // stack: + PUSH @SEGMENT_KERNEL_GENERAL + GET_CONTEXT + %build_address_no_offset + MLOAD_GENERAL + // stack: value +%endmacro + // Load a big-endian u32 from kernel general memory in the current context. %macro mload_current_general_u32 // stack: offset @@ -185,6 +195,19 @@ // stack: (empty) %endmacro +// Store a single value to kernel general memory in the current context. +%macro mstore_current_general_no_offset + // stack: value + PUSH @SEGMENT_KERNEL_GENERAL + // stack: segment, value + GET_CONTEXT + // stack: context, segment, value + %build_address_no_offset + SWAP1 + MSTORE_GENERAL + // stack: (empty) +%endmacro + %macro mstore_current_general(offset) // stack: value PUSH $offset