From 3182c22e16a1e68ad7008a1296f2271f43c0485e Mon Sep 17 00:00:00 2001 From: evoskuil Date: Fri, 6 Dec 2024 01:28:45 -0500 Subject: [PATCH] Single block shani. --- include/bitcoin/system/hash/sha/algorithm.hpp | 30 +++- .../system/impl/hash/sha/algorithm_double.ipp | 21 ++- .../impl/hash/sha/algorithm_iterate.ipp | 12 +- .../system/impl/hash/sha/algorithm_native.ipp | 170 +++++++++++------- .../impl/hash/sha/algorithm_padding.ipp | 91 +++++++--- .../system/impl/hash/sha/algorithm_single.ipp | 117 +++++++++--- .../system/impl/hash/sha/algorithm_stream.ipp | 6 +- 7 files changed, 313 insertions(+), 134 deletions(-) diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index 5b25bfac1c..3c11a9c058 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -166,6 +166,9 @@ class algorithm template = true> using wstate_t = std_array; + ////template = true> + ////using wblock_t = std_array; + /// Other types. /// ----------------------------------------------------------------------- @@ -236,17 +239,23 @@ class algorithm /// Padding. /// ----------------------------------------------------------------------- + /// Scheduled padding (new and existing buffer objects). template static CONSTEVAL buffer_t scheduled_pad() NOEXCEPT; - static CONSTEVAL chunk_t chunk_pad() NOEXCEPT; - static CONSTEVAL pad_t stream_pad() NOEXCEPT; - template static constexpr void schedule_n(buffer_t& buffer) NOEXCEPT; static constexpr void schedule_n(buffer_t& buffer, size_t blocks) NOEXCEPT; static constexpr void schedule_1(buffer_t& buffer) NOEXCEPT; - static constexpr void pad_half(buffer_t& buffer) NOEXCEPT; - static constexpr void pad_n(buffer_t& buffer, count_t blocks) NOEXCEPT; + + /// Unscheduled padding (new objects). + static words_t pad_block() NOEXCEPT; + static words_t pad_blocks(count_t blocks) NOEXCEPT; + static CONSTEVAL chunk_t chunk_pad() NOEXCEPT; + static CONSTEVAL pad_t stream_pad() NOEXCEPT; + + /// Unscheduled padding (update block or buffer object). + static constexpr void pad_half(auto& buffer) NOEXCEPT; + static constexpr void pad_n(auto& buffer, count_t blocks) NOEXCEPT; /// Double hashing. /// ----------------------------------------------------------------------- @@ -363,6 +372,8 @@ class algorithm /// Native SHA optimizations (single blocks). /// ----------------------------------------------------------------------- + template + INLINE static xint128_t bytes(xint128_t message) NOEXCEPT; INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT; INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT; @@ -373,7 +384,14 @@ class algorithm INLINE static void round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT; - static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT; + template + INLINE static void native_rounds(xint128_t& lo, xint128_t& hi, + const block_t& block) NOEXCEPT; + + static void native_(state_t& state, iblocks_t& blocks) NOEXCEPT; + static void native_(state_t& state, const block_t& block) NOEXCEPT; + INLINE static void native_preswapped(state_t& state, + const words_t& block) NOEXCEPT; public: /// Summary public values. diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_double.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_double.ipp index 668e2c1c8e..cf1be17fd7 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_double.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_double.ipp @@ -65,9 +65,10 @@ double_hash(const ablocks_t& blocks) NOEXCEPT { static_assert(is_same_type); - buffer_t buffer{}; auto state = H::get; iterate(state, blocks); + + buffer_t buffer{}; schedule_n(buffer); compress(state, buffer); @@ -77,6 +78,7 @@ double_hash(const ablocks_t& blocks) NOEXCEPT schedule(buffer); state = H::get; compress(state, buffer); + return output(state); } @@ -89,9 +91,10 @@ double_hash(iblocks_t&& blocks) NOEXCEPT // Save block count, as iterable decrements. const auto count = blocks.size(); - buffer_t buffer{}; auto state = H::get; iterate(state, blocks); + + buffer_t buffer{}; schedule_n(buffer, count); compress(state, buffer); @@ -101,6 +104,7 @@ double_hash(iblocks_t&& blocks) NOEXCEPT schedule(buffer); state = H::get; compress(state, buffer); + return output(state); } @@ -110,9 +114,9 @@ double_hash(const block_t& block) NOEXCEPT { static_assert(is_same_type); - buffer_t buffer{}; - auto state = H::get; + + buffer_t buffer{}; input(buffer, block); schedule(buffer); compress(state, buffer); @@ -125,6 +129,7 @@ double_hash(const block_t& block) NOEXCEPT schedule(buffer); state = H::get; compress(state, buffer); + return output(state); } @@ -134,8 +139,9 @@ double_hash(const half_t& half) NOEXCEPT { static_assert(is_same_type); - buffer_t buffer{}; auto state = H::get; + + buffer_t buffer{}; input_left(buffer, half); pad_half(buffer); schedule(buffer); @@ -147,6 +153,7 @@ double_hash(const half_t& half) NOEXCEPT schedule(buffer); state = H::get; compress(state, buffer); + return output(state); } @@ -156,8 +163,9 @@ double_hash(const half_t& left, const half_t& right) NOEXCEPT { static_assert(is_same_type); - buffer_t buffer{}; auto state = H::get; + + buffer_t buffer{}; input_left(buffer, left); input_right(buffer, right); schedule(buffer); @@ -171,6 +179,7 @@ double_hash(const half_t& left, const half_t& right) NOEXCEPT schedule(buffer); state = H::get; compress(state, buffer); + return output(state); } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index bb51e8e9ce..17eff4e206 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -243,19 +243,19 @@ iterate_vector(state_t& state, const ablocks_t& blocks) NOEXCEPT // intel-sha-extensions-white-paper-402097.pdf TEMPLATE +template INLINE void CLASS:: -iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT +iterate_native(state_t& state, const ablocks_t& blocks) NOEXCEPT { - native_rounds(state, blocks); + iblocks_t iblocks{ array_cast(blocks) }; + native_(state, iblocks); } TEMPLATE -template INLINE void CLASS:: -iterate_native(state_t& state, const ablocks_t& blocks) NOEXCEPT +iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT { - iblocks_t iblocks{ array_cast(blocks) }; - native_rounds(state, iblocks); + native_(state, blocks); } // Dispatch and normal forms. diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 79f7a34898..dd6277ca11 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -34,10 +34,22 @@ namespace sha { // ---------------------------------------------------------------------------- // protected +TEMPLATE +template +INLINE xint128_t CLASS:: +bytes(xint128_t message) NOEXCEPT +{ + if constexpr (Swap) + return byteswap(message); + else + return message; +} + TEMPLATE INLINE void CLASS:: shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT { + // shuffle organizes state as expected by sha256rnds2. const auto shuffle0 = mm_shuffle_epi32(state0, 0xb1); const auto shuffle1 = mm_shuffle_epi32(state1, 0x1b); state0 = mm_alignr_epi8(shuffle0, shuffle1, 0x08); @@ -48,6 +60,7 @@ TEMPLATE INLINE void CLASS:: unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT { + // unshuffle restores state to normal form. const auto shuffle0 = mm_shuffle_epi32(state0, 0x1b); const auto shuffle1 = mm_shuffle_epi32(state1, 0xb1); state0 = mm_blend_epi16(shuffle0, shuffle1, 0xf0); @@ -84,91 +97,124 @@ round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT } TEMPLATE -void CLASS:: -native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT +template +INLINE void CLASS:: +native_rounds(xint128_t& lo, xint128_t& hi, const block_t& block) NOEXCEPT { - // Individual state vars are used vs. array to ensure register persistence. - auto& wstate = array_cast(state); - auto lo = load(wstate[0]); - auto hi = load(wstate[1]); + const auto& wblock = array_cast(block); - // shuffle organizes state as expected by sha256rnds2. - shuffle(lo, hi); + const auto start_lo = lo; + const auto start_hi = hi; + + auto message0 = bytes(load(wblock[0])); + round_4<0>(lo, hi, message0); - while (!blocks.empty()) - { - const auto start_lo = lo; - const auto start_hi = hi; - const auto& wblock = array_cast(blocks.to_array()); + auto message1 = bytes(load(wblock[1])); + round_4<1>(lo, hi, message1); - auto message0 = byteswap(load(wblock[0])); - round_4<0>(lo, hi, message0); + prepare(message0, message1); + auto message2 = bytes(load(wblock[2])); + round_4<2>(lo, hi, message2); - auto message1 = byteswap(load(wblock[1])); - round_4<1>(lo, hi, message1); + prepare(message1, message2); + auto message3 = bytes(load(wblock[3])); + round_4<3>(lo, hi, message3); - prepare(message0, message1); - auto message2 = byteswap(load(wblock[2])); - round_4<2>(lo, hi, message2); + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<4>(lo, hi, message0); - prepare(message1, message2); - auto message3 = byteswap(load(wblock[3])); - round_4<3>(lo, hi, message3); + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<5>(lo, hi, message1); - prepare(message2, message3, message0); - prepare(message2, message3); - round_4<4>(lo, hi, message0); + prepare(message0, message1, message2); + prepare(message0, message1); + round_4<6>(lo, hi, message2); - prepare(message3, message0, message1); - prepare(message3, message0); - round_4<5>(lo, hi, message1); + prepare(message1, message2, message3); + prepare(message1, message2); + round_4<7>(lo, hi, message3); - prepare(message0, message1, message2); - prepare(message0, message1); - round_4<6>(lo, hi, message2); + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<8>(lo, hi, message0); - prepare(message1, message2, message3); - prepare(message1, message2); - round_4<7>(lo, hi, message3); + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<9>(lo, hi, message1); - prepare(message2, message3, message0); - prepare(message2, message3); - round_4<8>(lo, hi, message0); + prepare(message0, message1, message2); + prepare(message0, message1); + round_4<10>(lo, hi, message2); - prepare(message3, message0, message1); - prepare(message3, message0); - round_4<9>(lo, hi, message1); + prepare(message1, message2, message3); + prepare(message1, message2); + round_4<11>(lo, hi, message3); - prepare(message0, message1, message2); - prepare(message0, message1); - round_4<10>(lo, hi, message2); + prepare(message2, message3, message0); + prepare(message2, message3); + round_4<12>(lo, hi, message0); - prepare(message1, message2, message3); - prepare(message1, message2); - round_4<11>(lo, hi, message3); + prepare(message3, message0, message1); + prepare(message3, message0); + round_4<13>(lo, hi, message1); - prepare(message2, message3, message0); - prepare(message2, message3); - round_4<12>(lo, hi, message0); + prepare(message0, message1, message2); + round_4<14>(lo, hi, message2); - prepare(message3, message0, message1); - prepare(message3, message0); - round_4<13>(lo, hi, message1); + prepare(message1, message2, message3); + round_4<15>(lo, hi, message3); - prepare(message0, message1, message2); - round_4<14>(lo, hi, message2); + lo = add(lo, start_lo); + hi = add(hi, start_hi); +} - prepare(message1, message2, message3); - round_4<15>(lo, hi, message3); +TEMPLATE +void CLASS:: +native_(state_t& state, iblocks_t& blocks) NOEXCEPT +{ + // Individual state vars are used vs. array to ensure register persistence. + auto& wstate = array_cast(state); + auto lo = load(wstate[0]); + auto hi = load(wstate[1]); + shuffle(lo, hi); - lo = add(lo, start_lo); - hi = add(hi, start_hi); - blocks.advance(); - } + for (auto& block : blocks) + native_rounds(lo, hi, block); - // unshuffle restores state to normal form. unshuffle(lo, hi); + store(wstate[0], lo); + store(wstate[1], hi); +} +TEMPLATE +void CLASS:: +native_(state_t& state, const block_t& block) NOEXCEPT +{ + auto& wstate = array_cast(state); + auto lo = load(wstate[0]); + auto hi = load(wstate[1]); + shuffle(lo, hi); + native_rounds(lo, hi, block); + unshuffle(lo, hi); + store(wstate[0], lo); + store(wstate[1], hi); +} + +TEMPLATE +INLINE void CLASS:: +native_preswapped(state_t& state, const words_t& block) NOEXCEPT +{ + auto& wstate = array_cast(state); + auto lo = load(wstate[0]); + auto hi = load(wstate[1]); + shuffle(lo, hi); + + // This override is for padding (big-endian, preswapped data). + native_rounds(lo, hi, array_cast(block)); + + unshuffle(lo, hi); store(wstate[0], lo); store(wstate[1], hi); } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp index 287049fc45..9159c7c260 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp @@ -23,12 +23,15 @@ // 5.1 Padding the Message // ============================================================================ +// All elements big endian (normal form) because endianness is on input. +// However for block padding endianness is performed in algorithm. So either +// must reverse that here and undo in algorithm, or bypass in algorithm. namespace libbitcoin { namespace system { namespace sha { -// protected +// Scheduled padding (new and existing buffer objects). // ---------------------------------------------------------------------------- TEMPLATE @@ -40,8 +43,8 @@ scheduled_pad() NOEXCEPT static_assert(Blocks <= maximum / byte_bits); // See comments in accumulator regarding padding endianness. - constexpr auto index = sub1(array_count); constexpr auto bytes = safe_multiply(Blocks, array_count); + constexpr auto index = sub1(array_count); buffer_t out{}; out.front() = bit_hi; @@ -50,29 +53,6 @@ scheduled_pad() NOEXCEPT return out; } -TEMPLATE -CONSTEVAL typename CLASS::chunk_t CLASS:: -chunk_pad() NOEXCEPT -{ - // See comments in accumulator regarding padding endianness. - constexpr auto bytes = possible_narrow_cast(array_count); - - chunk_t out{}; - out.front() = bit_hi; - out.back() = to_bits(bytes); - return out; -} - -TEMPLATE -CONSTEVAL typename CLASS::pad_t CLASS:: -stream_pad() NOEXCEPT -{ - // See comments in accumulator regarding padding endianness. - pad_t out{}; - out.front() = bit_hi; - return out; -} - TEMPLATE template constexpr void CLASS:: @@ -129,9 +109,64 @@ schedule_1(buffer_t& buffer) NOEXCEPT schedule_n(buffer); } +// Unscheduled padding (new objects). +// ---------------------------------------------------------------------------- + +TEMPLATE +typename CLASS::words_t CLASS:: +pad_blocks(count_t blocks) NOEXCEPT +{ + // Pad any number of whole blocks. + const auto bits = to_bits(blocks * array_count); + constexpr auto pad = stream_pad(); + + words_t block{}; + array_cast>(block) = pad; + + // Split count into hi/low words and assign end of padded block. + block[14] = hi_word(bits); + block[15] = lo_word(bits); + return block; +} + +TEMPLATE +typename CLASS::words_t CLASS:: +pad_block() NOEXCEPT +{ + return pad_blocks(one); +} + +TEMPLATE +CONSTEVAL typename CLASS::chunk_t CLASS:: +chunk_pad() NOEXCEPT +{ + // See comments in accumulator regarding padding endianness. + constexpr auto bytes = possible_narrow_cast(array_count); + + chunk_t out{}; + out.front() = bit_hi; + out.back() = to_bits(bytes); + return out; +} + +TEMPLATE +CONSTEVAL typename CLASS::pad_t CLASS:: +stream_pad() NOEXCEPT +{ + // See comments in accumulator regarding padding endianness. + pad_t out{}; + out.front() = bit_hi; + + // Size is not set. + return out; +} + +// Unscheduled padding (update block or buffer object). +// ---------------------------------------------------------------------------- + TEMPLATE constexpr void CLASS:: -pad_half(buffer_t& buffer) NOEXCEPT +pad_half(auto& buffer) NOEXCEPT { // Pad for any half block, unscheduled buffer. constexpr auto pad = chunk_pad(); @@ -155,11 +190,11 @@ pad_half(buffer_t& buffer) NOEXCEPT TEMPLATE constexpr void CLASS:: -pad_n(buffer_t& buffer, count_t blocks) NOEXCEPT +pad_n(auto& buffer, count_t blocks) NOEXCEPT { // Pad any number of whole blocks, unscheduled buffer. - constexpr auto pad = stream_pad(); const auto bits = to_bits(blocks * array_count); + constexpr auto pad = stream_pad(); if (std::is_constant_evaluated()) { diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_single.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_single.ipp index 76d8d83ff6..ee29582ce0 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_single.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_single.ipp @@ -34,49 +34,116 @@ template constexpr typename CLASS::digest_t CLASS:: hash(const ablocks_t& blocks) NOEXCEPT { - buffer_t buffer{}; - auto state = H::get; - iterate(state, blocks); - schedule_n(buffer); - compress(state, buffer); - return output(state); + if (std::is_constant_evaluated()) + { + auto state = H::get; + iterate(state, blocks); + buffer_t buffer{}; + schedule_n(buffer); + compress(state, buffer); + return output(state); + } + else if constexpr (native && SHA::strength == 256) + { + auto state = H::get; + iterate(state, blocks); + native_preswapped(state, pad_blocks(Size)); + return output(state); + } + else + { + auto state = H::get; + iterate(state, blocks); + buffer_t buffer{}; + schedule_n(buffer); + compress(state, buffer); + return output(state); + } } TEMPLATE typename CLASS::digest_t CLASS:: hash(iblocks_t&& blocks) NOEXCEPT { - // Save block count, as iterable decrements. - const auto count = blocks.size(); - - buffer_t buffer{}; - auto state = H::get; - iterate(state, blocks); - schedule_n(buffer, count); - compress(state, buffer); - return output(state); + if constexpr (native && SHA::strength == 256) + { + // Save block count, as iterable decrements. + const auto count = blocks.size(); + auto state = H::get; + iterate(state, blocks); + native_preswapped(state, pad_blocks(count)); + return output(state); + } + else + { + // Save block count, as iterable decrements. + const auto count = blocks.size(); + auto state = H::get; + iterate(state, blocks); + buffer_t buffer{}; + schedule_n(buffer, count); + compress(state, buffer); + return output(state); + } } TEMPLATE constexpr typename CLASS::digest_t CLASS:: hash(const block_t& block) NOEXCEPT { - buffer_t buffer{}; - auto state = H::get; - input(buffer, block); - schedule(buffer); - compress(state, buffer); - schedule_1(buffer); - compress(state, buffer); - return output(state); + if (std::is_constant_evaluated()) + { + auto state = H::get; + buffer_t buffer{}; + input(buffer, block); + schedule(buffer); + compress(state, buffer); + schedule_1(buffer); + compress(state, buffer); + return output(state); + } + else if constexpr (native && SHA::strength == 256) + { + ////auto state = H::get; + ////auto& wstate = array_cast(state); + ////auto lo = load(wstate[0]); + ////auto hi = load(wstate[1]); + ////shuffle(lo, hi); + ////native_(lo, hi, block); + ////native_(lo, hi, pad_1()); + ////unshuffle(lo, hi); + ////byteswap(lo); + ////byteswap(hi); + ////store(wstate[0], lo); + ////store(wstate[1], hi); + ////return array_cast(state); + + // Simpler but repeats shuffle/unshuffle, re-loads state, and unloads + // state before byteswap. + auto state = H::get; + native_(state, block); + native_preswapped(state, pad_block()); + return output(state); + } + else + { + auto state = H::get; + buffer_t buffer{}; + input(buffer, block); + schedule(buffer); + compress(state, buffer); + schedule_1(buffer); + compress(state, buffer); + return output(state); + } } TEMPLATE constexpr typename CLASS::digest_t CLASS:: hash(const half_t& half) NOEXCEPT { - buffer_t buffer{}; auto state = H::get; + buffer_t buffer{}; input_left(buffer, half); pad_half(buffer); schedule(buffer); @@ -88,8 +155,8 @@ TEMPLATE constexpr typename CLASS::digest_t CLASS:: hash(const half_t& left, const half_t& right) NOEXCEPT { - buffer_t buffer{}; auto state = H::get; + buffer_t buffer{}; input_left(buffer, left); input_right(buffer, right); schedule(buffer); diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_stream.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_stream.ipp index a6c2077e05..cc57b5e302 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_stream.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_stream.ipp @@ -60,6 +60,7 @@ finalize(state_t& state, size_t blocks) NOEXCEPT buffer_t buffer{}; schedule_n(buffer, blocks); compress(state, buffer); + return output(state); } @@ -70,12 +71,14 @@ finalize_second(const state_t& state) NOEXCEPT // No hash(state_t) optimizations for sha160 (requires chunk_t/half_t). static_assert(is_same_type); - buffer_t buffer{}; auto state2 = H::get; + + buffer_t buffer{}; reinput(buffer, state); pad_half(buffer); schedule(buffer); compress(state2, buffer); + return output(state2); } @@ -94,6 +97,7 @@ finalize_double(state_t& state, size_t blocks) NOEXCEPT schedule(buffer); auto state2 = H::get; compress(state2, buffer); + return output(state2); }