Skip to content

Commit

Permalink
wip on blake3
Browse files Browse the repository at this point in the history
Signed-off-by: Pantelis Antoniou <[email protected]>
  • Loading branch information
pantoniou committed Aug 31, 2023
1 parent 7a62257 commit 73d17d7
Show file tree
Hide file tree
Showing 3 changed files with 533 additions and 32 deletions.
377 changes: 377 additions & 0 deletions src/blake3/blake3.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <alloca.h>
#include <pthread.h>

#include "blake3.h"
#include "blake3_impl.h"
Expand Down Expand Up @@ -359,11 +362,16 @@ INLINE void compress_subtree_to_parent_node(
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

// Populate the per-host state: probe the SIMD width once and cache it so
// the hashing paths don't have to re-detect it on every call.
INLINE void hasher_init_host_state(blake3_host_state *host_state) {
  const size_t degree = blake3_simd_degree();
  host_state->simd_degree = degree;
}

INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->key, key, BLAKE3_KEY_LEN);
chunk_state_init(&self->chunk, key, flags);
self->cv_stack_len = 0;
hasher_init_host_state(&self->host_state);
}

// Initialize a hasher for plain (unkeyed) hashing: standard IV, no extra
// domain flags.
void blake3_hasher_init(blake3_hasher *self) {
  hasher_init_base(self, IV, 0);
}
Expand Down Expand Up @@ -614,3 +622,372 @@ void blake3_hasher_reset(blake3_hasher *self) {
chunk_state_reset(&self->chunk, self->key, 0);
self->cv_stack_len = 0;
}

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Hash up to simd_degree whole chunks of `input` in one SIMD batch on a
// single thread, writing one chaining value per chunk to `out`. A trailing
// partial chunk, if present, is hashed separately afterwards. Returns the
// number of chaining values written. These chunks are never the root and
// never empty; those cases use a different codepath.
static size_t compress_chunks_parallel_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
{
  const size_t simd_degree = hs->simd_degree;
  const uint8_t **chunks_array;
  size_t num_full_chunks, i, tail_len;
  blake3_chunk_state tail_state;
  output_t tail_output;

  assert(0 < input_len);
  assert(input_len <= simd_degree * BLAKE3_CHUNK_LEN);

  // Collect pointers to each complete chunk for the batched hash call.
  chunks_array = alloca(sizeof(chunks_array[0]) * simd_degree);
  num_full_chunks = input_len / BLAKE3_CHUNK_LEN;
  for (i = 0; i < num_full_chunks; i++)
    chunks_array[i] = &input[i * BLAKE3_CHUNK_LEN];

  blake3_hash_many(chunks_array, num_full_chunks, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out);

  tail_len = input_len - num_full_chunks * BLAKE3_CHUNK_LEN;
  if (tail_len == 0)
    return num_full_chunks;

  // Hash the remaining partial chunk, if there is one. Note that the empty
  // chunk (meaning the empty message) is a different codepath.
  chunk_state_init(&tail_state, key, flags);
  tail_state.chunk_counter = chunk_counter + (uint64_t)num_full_chunks;
  chunk_state_update(&tail_state, &input[num_full_chunks * BLAKE3_CHUNK_LEN], tail_len);
  tail_output = chunk_state_output(&tail_state);
  output_chaining_value(&tail_output, &out[num_full_chunks * BLAKE3_OUT_LEN]);
  return num_full_chunks + 1;
}

// Compress consecutive pairs of child chaining values into parent chaining
// values, up to simd_degree_or_2 parents in one SIMD batch on a single
// thread. If there is an odd child CV left over, it is copied through
// unchanged as an extra output. Returns the number of output chaining
// values written. These parents are never the root and never empty; those
// cases use a different codepath.
static size_t compress_parents_parallel_panto(blake3_host_state *hs, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
{
  const uint8_t **parents_array;
  size_t simd_degree_or_2, num_parents, i;

  simd_degree_or_2 = hs->simd_degree >= 2 ? hs->simd_degree : 2;

  assert(2 <= num_chaining_values);
  assert(num_chaining_values <= 2 * simd_degree_or_2);

  // Each parent consumes two child CVs laid out back to back.
  parents_array = alloca(sizeof(parents_array[0]) * simd_degree_or_2);
  num_parents = num_chaining_values / 2;
  for (i = 0; i < num_parents; i++)
    parents_array[i] = &child_chaining_values[2 * i * BLAKE3_OUT_LEN];

  blake3_hash_many(parents_array, num_parents, 1, key,
                   0, // Parents always use counter 0.
                   false, flags | PARENT,
                   0, // Parents have no start flags.
                   0, // Parents have no end flags.
                   out);

  // An odd child left over becomes an output as-is.
  if (num_chaining_values % 2 != 0) {
    memcpy(&out[num_parents * BLAKE3_OUT_LEN], &child_chaining_values[2 * num_parents * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
    return num_parents + 1;
  }
  return num_parents;
}

// Argument/result bundle for one blake3_compress_subtree_wide_panto call
// that runs on its own pthread. The spawning side fills in every field
// except `n`; the worker thread writes its return value into `n`.
typedef struct {
pthread_t thread; // worker thread handle (set by pthread_create)
blake3_host_state *hs; // host capabilities (SIMD degree), not owned
const uint8_t *input; // subtree input bytes, not owned
size_t input_len; // length of `input` in bytes
const uint32_t *key; // 8-word key schedule, not owned
uint64_t chunk_counter; // counter of the first chunk in this subtree
uint8_t flags; // domain flags forwarded to the compression
uint8_t *out; // destination buffer for the resulting chaining values
size_t n; // OUT: number of chaining values written to `out`
} Blake3CompressSubtreeParallelState;

static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out);

// pthread entry point: unpack the state bundle, run the subtree
// compression, and record the number of chaining values produced in
// state->n. Always returns NULL; the result travels through the bundle.
void *blake3_compress_subtree_wide_thread_start_panto(void *arg)
{
  Blake3CompressSubtreeParallelState *state = arg;

  state->n = blake3_compress_subtree_wide_panto(state->hs, state->input,
                                                state->input_len, state->key,
                                                state->chunk_counter,
                                                state->flags, state->out);
  return NULL;
}

// Run one subtree compression on a freshly created worker thread and wait
// for it to finish, returning the number of chaining values written to
// `out`. NOTE: this spawns and immediately joins one thread per call, so
// there is no concurrency gain yet -- it is scaffolding for future real
// multi-threading.
static size_t blake3_compress_subtree_wide_thread_pool_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t *key, uint64_t chunk_counter, uint8_t flags, uint8_t *out)
{
  Blake3CompressSubtreeParallelState s = {
    .hs = hs,
    .input = input,
    .input_len = input_len,
    .key = key,
    .chunk_counter = chunk_counter,
    .flags = flags,
    .out = out,
    .n = 0,
  };
  int rc;

  rc = pthread_create(&s.thread, NULL, blake3_compress_subtree_wide_thread_start_panto, &s);
  if (rc != 0) {
    // pthread_create can fail (e.g. EAGAIN under resource pressure).
    // Previously this was only caught by assert(), so with NDEBUG a
    // failure was ignored and pthread_join was called on an
    // indeterminate pthread_t -- undefined behavior. Fall back to a
    // direct synchronous call, which is always correct.
    return blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, out);
  }

  rc = pthread_join(s.thread, NULL);
  // Joining a thread we just created with a valid handle should not fail.
  assert(rc == 0);
  (void)rc; // keep rc "used" when NDEBUG compiles the assert away

  return s.n;
}

// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
{
size_t left_input_len, right_input_len, simd_degree, simd_degree_or_2, degree, left_n, right_n;
const uint8_t *left_input, *right_input;
uint64_t left_chunk_counter, right_chunk_counter;
uint8_t *cv_array, *left_cvs, *right_cvs;

simd_degree = hs->simd_degree;
simd_degree_or_2 = simd_degree < 2 ? 2 : simd_degree;

// Base case: the input fits in one SIMD batch of chunks -- hash directly.
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
// can help performance on smaller platforms.
if (input_len <= simd_degree * BLAKE3_CHUNK_LEN)
return compress_chunks_parallel_panto(hs, input, input_len, key, chunk_counter, flags, out);

// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
left_input_len = left_len(input_len);
right_input_len = input_len - left_input_len;

left_input = &input[0];
right_input = &input[left_input_len];
left_chunk_counter = chunk_counter;
// NOTE(review): this division assumes left_len() returns a whole number of
// chunks -- confirm against left_len()'s definition.
right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);

// Make space for the child outputs: 2 * simd_degree_or_2 CVs, to account
// for the special case of returning 2 outputs when the SIMD degree is 1.
// Layout: left child CVs at offset 0, right child CVs at degree * OUT_LEN.
cv_array = alloca(2 * simd_degree_or_2 * BLAKE3_OUT_LEN);
degree = simd_degree;
if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
// The special case: We always use a degree of at least two, to make
// sure there are two outputs. Except, as noted above, at the chunk
// level, where we allow degree=1. (Note that the 1-chunk-input case is
// a different codepath.)
degree = 2;
}
left_cvs = &cv_array[0];
right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];

// Recurse on both halves -- currently through the one-thread-per-call
// helper; flip the #if to recurse directly on this thread instead.
#if 0
left_n = blake3_compress_subtree_wide_panto(hs, left_input, left_input_len, key, left_chunk_counter, flags, left_cvs);
right_n = blake3_compress_subtree_wide_panto(hs, right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
#else
left_n = blake3_compress_subtree_wide_thread_pool_panto(hs, left_input, left_input_len, key, left_chunk_counter, flags, left_cvs);
right_n = blake3_compress_subtree_wide_thread_pool_panto(hs, right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
#endif

// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
// them directly, to make sure we always have at least two outputs.
if (left_n == 1) {
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
return 2;
}

// Otherwise, do one layer of parent node compression.
return compress_parents_parallel_panto(hs, cv_array, left_n + right_n, key, flags, out);
}

// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
static void compress_subtree_to_parent_node_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
{
uint8_t *cv_array, *out_array;
size_t simd_degree_or_2, num_cvs;

simd_degree_or_2 = hs->simd_degree < 2 ? 2 : hs->simd_degree;

// cv_array holds the CVs coming back from the wide compression (at most
// simd_degree_or_2 of them, asserted below); out_array only ever receives
// half as many parents per condensing pass, hence the / 2.
cv_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN);
out_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN / 2);

assert(input_len > BLAKE3_CHUNK_LEN);

num_cvs = blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, cv_array);

assert(num_cvs <= simd_degree_or_2);

// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
//
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= simd_degree_or_2) {
num_cvs = compress_parents_parallel_panto(hs, cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
// Exactly two CVs remain: the would-be root parent's message bytes.
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

// Streaming update: absorb `input_len` bytes of `input` into the hasher.
// `offset` and `total_size` are unused here except by the disabled debug
// trace below -- presumably reserved for future parallel/out-of-order
// updates; TODO confirm with callers.
void blake3_hasher_update_panto(blake3_hasher *self, const void *input, size_t input_len, size_t offset, size_t total_size)
{
const uint8_t *input_bytes = input;
output_t output;
uint8_t cv[BLAKE3_OUT_LEN], cv_pair[2 * BLAKE3_OUT_LEN];
blake3_chunk_state chunk_state;
size_t take, subtree_len;
uint64_t count_so_far, subtree_chunks;

// fprintf(stderr, "%s: %zu-%zu/%zd %zu\n", __func__, offset, input_len, (ssize_t)total_size, self->chunk.chunk_counter * BLAKE3_CHUNK_LEN + chunk_state_len(&self->chunk));

// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_update(&hasher, v.data(), v.size());
// (input_len is a size_t, so <= 0 is simply the == 0 test.)
if (input_len <= 0)
return;

// If we have some partial chunk bytes in the internal chunk_state, we need
// to finish that chunk first.
if (chunk_state_len(&self->chunk) > 0) {

take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
if (take > input_len)
take = input_len;

chunk_state_update(&self->chunk, input_bytes, take);
input_bytes += take;
input_len -= take;

// If the input is exhausted, the (possibly now full) chunk stays buffered
// in chunk_state -- it may still turn out to be the root, so don't
// finalize it here.
if (input_len <= 0)
return;

// Otherwise we've filled the current chunk and more input is coming, so
// finalize this chunk. In this case we know it's not the root.
output = chunk_state_output(&self->chunk);
output_chaining_value(&output, cv);
hasher_push_cv(self, cv, self->chunk.chunk_counter);
chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
}

// Now the chunk_state is clear, and we have more input. If there's more than
// a single chunk (so, definitely not the root chunk), hash the largest whole
// subtree we can, with the full benefits of SIMD (and maybe in the future,
// multi-threading) parallelism. Two restrictions:
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
// the right edge can be incomplete, and we don't know where the right edge
// is going to be until we get to finalize().
// - The subtree must evenly divide the total number of chunks up until this
// point (if total is not 0). If the current incomplete subtree is only
// waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
// to complete the current subtree first.
// Because we might need to break up the input to form powers of 2, or to
// evenly divide what we already have, this part runs in a loop.

while (input_len > BLAKE3_CHUNK_LEN) {

subtree_len = round_down_to_power_of_2(input_len);
count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;

// Shrink the subtree_len until it evenly divides the count so far. We know
// that subtree_len itself is a power of 2, so we can use a bitmasking
// trick instead of an actual remainder operation. (Note that if the caller
// consistently passes power-of-2 inputs of the same size, as is hopefully
// typical, this loop condition will always fail, and subtree_len will
// always be the full length of the input.)
//
// An aside: We don't have to shrink subtree_len quite this much. For
// example, if count_so_far is 1, we could pass 2 chunks to
// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
// get the right answer in the end, and we might get to use 2-way SIMD
// parallelism. The problem with this optimization, is that it gets us
// stuck always hashing 2 chunks. The total number of chunks will remain
// odd, and we'll never graduate to higher degrees of parallelism. See
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0)
subtree_len /= 2;

// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
if (subtree_len <= BLAKE3_CHUNK_LEN) {
chunk_state_init(&chunk_state, self->key, self->chunk.flags);
chunk_state.chunk_counter = self->chunk.chunk_counter;
chunk_state_update(&chunk_state, input_bytes, subtree_len);
// This local `output` shadows the outer one; both are scratch.
output_t output = chunk_state_output(&chunk_state);
output_chaining_value(&output, cv);
hasher_push_cv(self, cv, chunk_state.chunk_counter);
} else {
// Push the two halves of the subtree's top parent; the right half's
// counter starts halfway through the subtree's chunks.
compress_subtree_to_parent_node_panto(&self->host_state, input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
}
self->chunk.chunk_counter += subtree_chunks;
input_bytes += subtree_len;
input_len -= subtree_len;
}

// If there's any remaining input less than a full chunk, add it to the chunk
// state. In that case, also do a final merge loop to make sure the subtree
// stack doesn't contain any unmerged pairs. The remaining input means we
// know these merges are non-root. This merge loop isn't strictly necessary
// here, because hasher_push_chunk_cv already does its own merge loop, but it
// simplifies blake3_hasher_finalize below.
if (input_len > 0) {
chunk_state_update(&self->chunk, input_bytes, input_len);
hasher_merge_cv_stack(self, self->chunk.chunk_counter);
}
}
Loading

0 comments on commit 73d17d7

Please sign in to comment.