diff --git a/src/blake3/blake3.c b/src/blake3/blake3.c index 692f4b02..6a78c2eb 100644 --- a/src/blake3/blake3.c +++ b/src/blake3/blake3.c @@ -1,6 +1,9 @@ #include #include #include +#include +#include +#include #include "blake3.h" #include "blake3_impl.h" @@ -359,11 +362,16 @@ INLINE void compress_subtree_to_parent_node( memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); } +INLINE void hasher_init_host_state(blake3_host_state *host_state) { /* Record the SIMD degree reported by blake3_simd_degree() in the per-hasher host state. */ + host_state->simd_degree = blake3_simd_degree(); +} + INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8], uint8_t flags) { memcpy(self->key, key, BLAKE3_KEY_LEN); chunk_state_init(&self->chunk, key, flags); self->cv_stack_len = 0; + hasher_init_host_state(&self->host_state); /* populate self->host_state alongside the key and chunk state */ } void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); } @@ -614,3 +622,372 @@ void blake3_hasher_reset(blake3_hasher *self) { chunk_state_reset(&self->chunk, self->key, 0); self->cv_stack_len = 0; } + +////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. 
+static size_t compress_chunks_parallel_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	const size_t lanes = hs->simd_degree;
+	const uint8_t **whole_chunks;
+	size_t nwhole, tail_len, idx;
+	blake3_chunk_state tail_state;
+	output_t tail_output;
+
+	assert(0 < input_len);
+	assert(input_len <= lanes * BLAKE3_CHUNK_LEN);
+
+	// One pointer slot per SIMD lane; the assert above is what bounds the
+	// number of complete chunks to the number of slots.
+	whole_chunks = alloca(sizeof(whole_chunks[0]) * lanes);
+
+	// Record where every complete chunk starts.
+	nwhole = input_len / BLAKE3_CHUNK_LEN;
+	for (idx = 0; idx < nwhole; idx++)
+		whole_chunks[idx] = &input[idx * BLAKE3_CHUNK_LEN];
+
+	// Hash all complete chunks in one call; chaining values land at the
+	// front of out, and the chunk counter is incremented per chunk.
+	blake3_hash_many(whole_chunks, nwhole, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out);
+
+	tail_len = input_len - nwhole * BLAKE3_CHUNK_LEN;
+	if (tail_len == 0)
+		return nwhole;
+
+	// A trailing partial chunk goes through a temporary chunk state; its
+	// chaining value is appended after the ones written above. (The empty
+	// chunk, i.e. the empty message, is handled by a different codepath.)
+	chunk_state_init(&tail_state, key, flags);
+	tail_state.chunk_counter = chunk_counter + (uint64_t)nwhole;
+	chunk_state_update(&tail_state, &input[nwhole * BLAKE3_CHUNK_LEN], tail_len);
+	tail_output = chunk_state_output(&tail_state);
+	output_chaining_value(&tail_output, &out[nwhole * BLAKE3_OUT_LEN]);
+	return nwhole + 1;
+}
+
+// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+// on a single thread. Write out the parent chaining values and return the
+// number of parents hashed. (If there's an odd input chaining value left over,
+// return it as an additional output.) These parents are never the root and
+// never empty; those cases use a different codepath.
+static size_t compress_parents_parallel_panto(blake3_host_state *hs, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
+{
+	const uint8_t **parents_array;
+	size_t simd_degree_or_2, parents_array_len;
+
+	simd_degree_or_2 = hs->simd_degree < 2 ? 2 : hs->simd_degree;
+
+	assert(2 <= num_chaining_values);
+	assert(num_chaining_values <= 2 * simd_degree_or_2);
+
+	parents_array = alloca(sizeof(parents_array[0]) * simd_degree_or_2);
+	parents_array_len = 0;
+
+	// Each parent consumes two adjacent child chaining values.
+	while (num_chaining_values - (2 * parents_array_len) >= 2) {
+		parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
+		parents_array_len++;
+	}
+
+	blake3_hash_many(parents_array, parents_array_len, 1, key,
+			 0, // Parents always use counter 0.
+			 false, flags | PARENT,
+			 0, // Parents have no start flags.
+			 0, // Parents have no end flags.
+			 out);
+
+	// If there's an odd child left over, it becomes an output.
+	if (num_chaining_values > 2 * parents_array_len) {
+		memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
+		return parents_array_len + 1;
+	}
+	return parents_array_len;
+}
+
+// Argument/result bundle handed to the worker thread: the fields mirror the
+// parameters of blake3_compress_subtree_wide_panto(), and n receives its
+// return value.
+typedef struct {
+	pthread_t thread;
+	blake3_host_state *hs;
+	const uint8_t *input;
+	size_t input_len;
+	const uint32_t *key;
+	uint64_t chunk_counter;
+	uint8_t flags;
+	uint8_t *out;
+	size_t n;
+} Blake3CompressSubtreeParallelState;
+
+static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out);
+
+// Thread entry point: run the subtree compression described by arg (a
+// Blake3CompressSubtreeParallelState) and store the number of chaining values
+// produced in its n field. Always returns NULL.
+void *blake3_compress_subtree_wide_thread_start_panto(void *arg)
+{
+	Blake3CompressSubtreeParallelState *s = arg;
+
+	s->n = blake3_compress_subtree_wide_panto(s->hs, s->input, s->input_len, s->key, s->chunk_counter, s->flags, s->out);
+
+	return NULL;
+}
+
+// Run blake3_compress_subtree_wide_panto() on a newly created thread and wait
+// for it to finish; returns the number of chaining values written to out.
+//
+// Despite the name there is no pool here: one thread is created per call and
+// joined immediately, so the caller does not overlap any work with it.
+//
+// If the thread cannot be created (pthread_create returns non-zero, e.g.
+// because a resource limit was hit), the work is done on the calling thread
+// instead. Previously this was only caught by an assert, so a build with
+// NDEBUG would join an uninitialized pthread_t and report zero outputs.
+static size_t blake3_compress_subtree_wide_thread_pool_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t *key, uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	Blake3CompressSubtreeParallelState s = {
+		.hs = hs,
+		.input = input,
+		.input_len = input_len,
+		.key = key,
+		.chunk_counter = chunk_counter,
+		.flags = flags,
+		.out = out,
+		.n = 0,
+	};
+	int rc;
+
+	rc = pthread_create(&s.thread, NULL, blake3_compress_subtree_wide_thread_start_panto, &s);
+	if (rc != 0)
+		return blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, out);
+
+	// s lives on this stack frame, so the join must happen before returning.
+	rc = pthread_join(s.thread, NULL);
+	assert(rc == 0);
+	(void)rc;	// keep NDEBUG builds free of unused-variable warnings
+
+	return s.n;
+}
+
+// The wide helper function returns (writes out) an array of chaining values
+// and returns the length of that array. The number of chaining values returned
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// if the input is shorter than that many chunks. The reason for maintaining a
+// wide array of chaining values going back up the tree, is to allow the
+// implementation to hash as many parents in parallel as possible.
+//
+// As a special case when the SIMD degree is 1, this function will still return
+// at least 2 outputs. This guarantees that this function doesn't perform the
+// root compression. (If it did, it would use the wrong flags, and also we
+// wouldn't be able to implement extendable output.) Note that this function is
+// not used when the whole input is only 1 chunk long; that's a different
+// codepath.
+//
+// Why not just have the caller split the input on the first update(), instead
+// of implementing this special rule? Because we don't want to limit SIMD or
+// multi-threading parallelism for that update().
+static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	const size_t lanes = hs->simd_degree;
+	const size_t lanes_or_2 = lanes < 2 ? 2 : lanes;
+	size_t lhs_len, lhs_count, rhs_count, stride;
+	uint8_t *cvs;
+
+	// Leaf case: the input fits in one batch of chunks. The degree is
+	// deliberately not bumped to 2 here, so a degree of 1 hands exactly one
+	// chunk to the chunk hasher.
+	if (input_len <= lanes * BLAKE3_CHUNK_LEN)
+		return compress_chunks_parallel_panto(hs, input, input_len, key, chunk_counter, flags, out);
+
+	// More chunks than lanes: split into a left and a right subtree. (The
+	// split is only optimal while the SIMD degree is a power of 2.)
+	lhs_len = left_len(input_len);
+
+	// Scratch space for the chaining values of both halves, sized with the
+	// "or 2" degree to cover the two-output special case below.
+	cvs = alloca(2 * lanes_or_2 * BLAKE3_OUT_LEN);
+
+	// The right half's outputs start `stride` chaining values into the
+	// scratch buffer. Above the chunk level a degree of 1 is treated as 2,
+	// so that there are always two outputs.
+	stride = lanes;
+	if (stride == 1 && lhs_len > BLAKE3_CHUNK_LEN)
+		stride = 2;
+
+	// Recurse through the threaded helper: left half first, then right.
+	lhs_count = blake3_compress_subtree_wide_thread_pool_panto(hs, input, lhs_len, key, chunk_counter, flags, cvs);
+	rhs_count = blake3_compress_subtree_wide_thread_pool_panto(hs, input + lhs_len, input_len - lhs_len, key, chunk_counter + (uint64_t)(lhs_len / BLAKE3_CHUNK_LEN), flags, cvs + stride * BLAKE3_OUT_LEN);
+
+	// One chaining value from the left half: hand both values back
+	// uncompressed so the caller always receives at least two outputs.
+	if (lhs_count == 1) {
+		memcpy(out, cvs, 2 * BLAKE3_OUT_LEN);
+		return 2;
+	}
+
+	// Otherwise fold one layer of parent nodes.
+	return compress_parents_parallel_panto(hs, cvs, lhs_count + rhs_count, key, flags, out);
+}
+
+// Hash a subtree with compress_subtree_wide(), and then condense the resulting
+// list of chaining values down to a single parent node. Don't compress that
+// last parent node, however. Instead, return its message bytes (the
+// concatenated chaining values of its children). This is necessary when the
+// first call to update() supplies a complete subtree, because the topmost
+// parent node of that subtree could end up being the root. It's also necessary
+// for extended output in the general case.
+//
+// As with compress_subtree_wide(), this function is not used on inputs of 1
+// chunk or less. That's a different codepath.
+static void compress_subtree_to_parent_node_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
+{
+	uint8_t *cv_array, *out_array;
+	size_t simd_degree_or_2, num_cvs;
+
+	simd_degree_or_2 = hs->simd_degree < 2 ? 2 : hs->simd_degree;
+
+	// cv_array receives the wide output; out_array holds one layer of
+	// parents, which is at most half as many values.
+	cv_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN);
+	out_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN / 2);
+
+	assert(input_len > BLAKE3_CHUNK_LEN);
+
+	num_cvs = blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, cv_array);
+
+	assert(num_cvs <= simd_degree_or_2);
+
+	// With a SIMD degree above 2 and enough input, the wide helper returns
+	// more than 2 chaining values. Condense them into 2 by forming parent
+	// nodes repeatedly. The second half of the loop condition restates the
+	// assert above, so the bound is still checked when NDEBUG removes it.
+	while (num_cvs > 2 && num_cvs <= simd_degree_or_2) {
+		num_cvs = compress_parents_parallel_panto(hs, cv_array, num_cvs, key, flags, out_array);
+		memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+	}
+	memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+// Feed input_len bytes at input into the hasher, using the *_panto subtree
+// helpers (runtime SIMD degree from self->host_state) for multi-chunk runs.
+//
+// offset and total_size are part of the interface but are not consulted by
+// the hashing logic below.
+void blake3_hasher_update_panto(blake3_hasher *self, const void *input, size_t input_len, size_t offset, size_t total_size)
+{
+	const uint8_t *input_bytes = input;
+	output_t output;
+	uint8_t cv[BLAKE3_OUT_LEN], cv_pair[2 * BLAKE3_OUT_LEN];
+	blake3_chunk_state chunk_state;
+	size_t take, subtree_len;
+	uint64_t count_so_far, subtree_chunks;
+
+	(void)offset;
+	(void)total_size;
+
+	// Explicitly checking for zero avoids causing UB by passing a null pointer
+	// to memcpy. This comes up in practice with things like:
+	//   std::vector v;
+	//   blake3_hasher_update(&hasher, v.data(), v.size());
+	if (input_len == 0)
+		return;
+
+	// If we have some partial chunk bytes in the internal chunk_state, we need
+	// to finish that chunk first.
+	if (chunk_state_len(&self->chunk) > 0) {
+
+		take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
+		if (take > input_len)
+			take = input_len;
+
+		chunk_state_update(&self->chunk, input_bytes, take);
+		input_bytes += take;
+		input_len -= take;
+
+		// If we've filled the current chunk and there's more coming, finalize this
+		// chunk and proceed. In this case we know it's not the root.
+		if (input_len == 0)
+			return;
+
+		output = chunk_state_output(&self->chunk);
+		output_chaining_value(&output, cv);
+		hasher_push_cv(self, cv, self->chunk.chunk_counter);
+		chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
+	}
+
+	// Now the chunk_state is clear, and we have more input. If there's more than
+	// a single chunk (so, definitely not the root chunk), hash the largest whole
+	// subtree we can, with the full benefits of SIMD (and maybe in the future,
+	// multi-threading) parallelism. Two restrictions:
+	// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
+	//   the right edge can be incomplete, and we don't know where the right edge
+	//   is going to be until we get to finalize().
+	// - The subtree must evenly divide the total number of chunks up until this
+	//   point (if total is not 0). If the current incomplete subtree is only
+	//   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
+	//   to complete the current subtree first.
+	// Because we might need to break up the input to form powers of 2, or to
+	// evenly divide what we already have, this part runs in a loop.
+
+	while (input_len > BLAKE3_CHUNK_LEN) {
+
+		subtree_len = round_down_to_power_of_2(input_len);
+		count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+
+		// Shrink the subtree_len until it evenly divides the count so far. We know
+		// that subtree_len itself is a power of 2, so we can use a bitmasking
+		// trick instead of an actual remainder operation. (Note that if the caller
+		// consistently passes power-of-2 inputs of the same size, as is hopefully
+		// typical, this loop condition will always fail, and subtree_len will
+		// always be the full length of the input.)
+		//
+		// An aside: We don't have to shrink subtree_len quite this much. For
+		// example, if count_so_far is 1, we could pass 2 chunks to
+		// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+		// get the right answer in the end, and we might get to use 2-way SIMD
+		// parallelism. The problem with this optimization, is that it gets us
+		// stuck always hashing 2 chunks. The total number of chunks will remain
+		// odd, and we'll never graduate to higher degrees of parallelism. See
+		// https://github.com/BLAKE3-team/BLAKE3/issues/69.
+		while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0)
+			subtree_len /= 2;
+
+		// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
+		// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
+		subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+		if (subtree_len <= BLAKE3_CHUNK_LEN) {
+			chunk_state_init(&chunk_state, self->key, self->chunk.flags);
+			chunk_state.chunk_counter = self->chunk.chunk_counter;
+			chunk_state_update(&chunk_state, input_bytes, subtree_len);
+			// Reuse the function-scope output rather than shadowing it.
+			output = chunk_state_output(&chunk_state);
+			output_chaining_value(&output, cv);
+			hasher_push_cv(self, cv, chunk_state.chunk_counter);
+		} else {
+			compress_subtree_to_parent_node_panto(&self->host_state, input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
+			hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
+			hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
+					self->chunk.chunk_counter + (subtree_chunks / 2));
+		}
+		self->chunk.chunk_counter += subtree_chunks;
+		input_bytes += subtree_len;
+		input_len -= subtree_len;
+	}
+
+	// If there's any remaining input less than a full chunk, add it to the chunk
+	// state. In that case, also do a final merge loop to make sure the subtree
+	// stack doesn't contain any unmerged pairs. The remaining input means we
+	// know these merges are non-root. This merge loop isn't strictly necessary
+	// here, because hasher_push_chunk_cv already does its own merge loop, but it
+	// simplifies blake3_hasher_finalize below.
+	if (input_len > 0) {
+		chunk_state_update(&self->chunk, input_bytes, input_len);
+		hasher_merge_cv_stack(self, self->chunk.chunk_counter);
+	}
+}
diff --git a/src/blake3/blake3.h b/src/blake3/blake3.h index 21e0d7b9..15fe1f22 100644 --- a/src/blake3/blake3.h +++ b/src/blake3/blake3.h @@ -49,6 +49,11 @@ typedef struct { } blake3_chunk_state; typedef struct { + size_t simd_degree; +} blake3_host_state; + +typedef struct { + blake3_host_state host_state; uint32_t key[8]; blake3_chunk_state chunk; uint8_t cv_stack_len; @@ -75,6 +80,8 @@ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t uint8_t *out, size_t out_len); BLAKE3_API void blake3_hasher_reset(blake3_hasher *self); +BLAKE3_API void blake3_hasher_update_panto(blake3_hasher *self, const void *input, + size_t input_len, size_t offset, size_t total_size); #ifdef __cplusplus } #endif diff --git a/src/internal/fy-b3sum.c b/src/internal/fy-b3sum.c index 21b1bfbd..57707d58 100644 --- a/src/internal/fy-b3sum.c +++ b/src/internal/fy-b3sum.c @@ -15,58 +15,175 @@ #include #include #include +#include +#include +#include +#include +#include #include +#include +#include #include
+/* getopt value for the long-only --no-mmap option (outside the char range) */
+#define OPT_NO_MMAP 128
+
+static struct option lopts[] = {
+	{"no-mmap", no_argument, 0, OPT_NO_MMAP },
+	{"help", no_argument, 0, 'h' },
+	{0, 0, 0, 0 },
+};
+
+/*
+ * Print the usage text to fp. When fp is stderr the process exits with
+ * EXIT_FAILURE and this function does not return.
+ */
+static void display_usage(FILE *fp, char *progname)
+{
+	fprintf(fp, "Usage: %s [options] [files]\n", progname);
+	fprintf(fp, "\nOptions:\n\n");
+	fprintf(fp, "\t--no-mmap : Do not mmap input files, read them as streams\n");
+	fprintf(fp, "\t--help, -h : Display help message\n");
+	fprintf(fp, "\nA file argument of \"-\" reads from standard input\n");
+
+	if (fp == stderr)
+		exit(EXIT_FAILURE);
+}
+
 int main(int argc, char *argv[]) { + struct stat sb; + void *mem = NULL; blake3_hasher hasher; uint8_t output[BLAKE3_OUT_LEN]; const char *filename; uint8_t *buf; size_t bufsz = 1 << 16; /* 64K */ - size_t rdn; - FILE *fp; - int i; - - if (argc > 1) { - filename = argv[1]; - fp = fopen(filename, "rb"); - if (!fp) { - fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno)); - return EXIT_FAILURE; + 
size_t rdn, filesize, offset;
+	FILE *fp = NULL;
+	int i, j, opt, lidx, rc, num_inputs, num_ok, fd = -1, exitcode;
+	bool no_mmap = false;
+
+	exitcode = EXIT_FAILURE;
+
+	while ((opt = getopt_long_only(argc, argv, "h", lopts, &lidx)) != -1) {
+		switch (opt) {
+		case OPT_NO_MMAP:
+			no_mmap = true;
+			break;
+		case 'h' :
+			display_usage(stdout, argv[0]);
+			return EXIT_SUCCESS;
+		default:
+			fprintf(stderr, "Unknown option\n");
+			display_usage(stderr, argv[0]);
+			return EXIT_FAILURE;
 		}
-	} else {
-		filename = "";
-		fp = stdin;
 	}
-	// Initialize the hasher.
-	blake3_hasher_init(&hasher);
+	buf = NULL;
+
+	num_inputs = argc - optind;
+	if (num_inputs <= 0) {
+		fprintf(stderr, "Missing file(s) argument\n");
+		display_usage(stderr, argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	num_ok = 0;
+	for (i = optind; i < argc; i++) {
+
+		filename = argv[i];
+
+		if (!strcmp(filename, "-")) {
+			fp = stdin;
+		} else {
+			fp = NULL;
+
+			fd = open(filename, O_RDONLY);
+			if (fd < 0) {
+				fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno));
+				goto err_out;
+			}
+
+			rc = fstat(fd, &sb);
+			if (rc < 0) {
+				fprintf(stderr, "failed to stat %s - %s\n", filename, strerror(errno));
+				goto err_out;
+			}
-	buf = alloca(bufsz);
+			filesize = (size_t)-1;
-	do {
-		rdn = fread(buf, 1, bufsz, fp);
-		if (rdn < 0) {
-			fprintf(stderr, "error on read of %s - %s\n", filename, strerror(errno));
-			return EXIT_FAILURE;
+			/* try to mmap */
+			if (sb.st_size > 0 && !no_mmap) {
+				filesize = sb.st_size;
+				mem = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
+				if (mem == MAP_FAILED)
+					mem = NULL;
+			}
+
+			/* unable to map? fallback to stream mode */
+			if (!mem) {
+				fp = fdopen(fd, "r");
+				if (!fp) {
+					fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno));
+					goto err_out;
+				}
+				/* the stream owns the descriptor now; fclose() closes it */
+				fd = -1;
+			} else
+				fp = NULL;
+
+		}
+
+		// Initialize the hasher.
+		blake3_hasher_init(&hasher);
+
+		if (mem) {
+
+			blake3_hasher_update_panto(&hasher, mem, filesize, 0, filesize);
+
+		} else if (fp) {
+
+			if (!buf)
+				buf = alloca(bufsz);
+
+			offset = 0;
+			do {
+				rdn = fread(buf, 1, bufsz, fp);
+				/* fread returns size_t, so errors show up via ferror(), not a negative count */
+				if (ferror(fp)) {
+					fprintf(stderr, "error on read of %s - %s\n", filename, strerror(errno));
+					goto err_out;
+				}
+				if (rdn > 0)
+					blake3_hasher_update_panto(&hasher, buf, rdn, offset, (size_t)-1);
+				offset += rdn;
+			} while (rdn >= bufsz);
+
+		} else {
+			fprintf(stderr, "Can't do anything for %s\n", filename);
+			goto err_out;
+		}
+
+		// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
+		blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+
+		// Print the hash as hexadecimal. A separate index is used so the
+		// argv index i of the enclosing loop is left untouched.
+		for (j = 0; j < BLAKE3_OUT_LEN; j++)
+			printf("%02x", output[j]);
+		printf(" %s\n", filename);
+
+		/* drop the mapping so the next input does not see stale data */
+		if (mem) {
+			munmap(mem, filesize);
+			mem = NULL;
+		}
+
+		if (fp) {
+			if (fp != stdin)
+				fclose(fp);
+			fp = NULL;
 		}
-		if (rdn > 0)
-			blake3_hasher_update(&hasher, buf, rdn);
-	} while (rdn >= bufsz);
+		if (fd >= 0) {
+			close(fd);
+			fd = -1;
+		}
-	// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
-	blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+		num_ok++;
+	}
+	if (num_inputs == num_ok)
+		exitcode = EXIT_SUCCESS;
-	// Print the hash as hexadecimal.
-	for (i = 0; i < BLAKE3_OUT_LEN; i++)
-		printf("%02x", output[i]);
-	printf("\n");
+err_out:
+	/* shared cleanup: on the success path all three resources are already released */
+	if (mem)
+		munmap(mem, filesize);
-	if (fp != stdin)
+	if (fp && fp != stdin)
 		fclose(fp);
-	return 0;
+	if (fd >= 0)
+		close(fd);
+
+	return exitcode;
 }