diff --git a/src/blake3/blake3.c b/src/blake3/blake3.c
index 692f4b02..6a78c2eb 100644
--- a/src/blake3/blake3.c
+++ b/src/blake3/blake3.c
@@ -1,6 +1,9 @@
 #include <assert.h>
 #include <stdbool.h>
 #include <string.h>
+#include <stdio.h>
+#include <alloca.h>
+#include <pthread.h>
 
 #include "blake3.h"
 #include "blake3_impl.h"
@@ -359,11 +362,16 @@ INLINE void compress_subtree_to_parent_node(
   memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
 }
 
+INLINE void hasher_init_host_state(blake3_host_state *host_state) { // fill in the per-hasher host description
+  host_state->simd_degree = blake3_simd_degree(); // cached here; the *_panto helpers read this field instead of querying again
+}
+
 INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
                              uint8_t flags) {
   memcpy(self->key, key, BLAKE3_KEY_LEN);
   chunk_state_init(&self->chunk, key, flags);
   self->cv_stack_len = 0;
+  hasher_init_host_state(&self->host_state); // also record the host SIMD degree for the *_panto update path
 }
 
 void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
@@ -614,3 +622,372 @@ void blake3_hasher_reset(blake3_hasher *self) {
   chunk_state_reset(&self->chunk, self->key, 0);
   self->cv_stack_len = 0;
 }
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Use SIMD parallelism to hash up to hs->simd_degree chunks at the same time
+// on a single thread. Write out the chunk chaining values and return the
+// number of chunks hashed. These chunks are never the root and never empty;
+// those cases use a different codepath.
+static size_t compress_chunks_parallel_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	const uint8_t **chunks_array;
+	size_t simd_degree, input_position, chunks_array_len;
+	uint64_t counter;
+	blake3_chunk_state chunk_state;
+	output_t output;
+
+	simd_degree = hs->simd_degree;
+
+	assert(0 < input_len);
+	assert(input_len <= simd_degree * BLAKE3_CHUNK_LEN);
+
+	chunks_array = alloca(sizeof(chunks_array[0]) * simd_degree); // one pointer slot per chunk, sized from the runtime SIMD degree
+	input_position = 0;
+	chunks_array_len = 0;
+
+	while (input_len - input_position >= BLAKE3_CHUNK_LEN) { // record the start of every complete chunk
+		chunks_array[chunks_array_len] = &input[input_position];
+		input_position += BLAKE3_CHUNK_LEN;
+		chunks_array_len++;
+	}
+
+	blake3_hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, true, flags, CHUNK_START, CHUNK_END, out);
+
+	if (input_len <= input_position) // nothing left over: the input was a whole number of chunks
+		return chunks_array_len;
+
+	// Hash the remaining partial chunk, if there is one. Note that the empty
+	// chunk (meaning the empty message) is a different codepath.
+	counter = chunk_counter + (uint64_t)chunks_array_len;
+	chunk_state_init(&chunk_state, key, flags);
+	chunk_state.chunk_counter = counter;
+	chunk_state_update(&chunk_state, &input[input_position], input_len - input_position);
+	output = chunk_state_output(&chunk_state);
+	output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]); // its CV goes right after the whole-chunk CVs
+	return chunks_array_len + 1;
+}
+
+// Use SIMD parallelism to hash up to max(hs->simd_degree, 2) parents at the
+// same time on a single thread. Write out the parent chaining values and return
+// the number of parents hashed. (If there's an odd input chaining value left over,
+// return it as an additional output.) These parents are never the root and
+// never empty; those cases use a different codepath.
+static size_t compress_parents_parallel_panto(blake3_host_state *hs, const uint8_t *child_chaining_values, size_t num_chaining_values, const uint32_t key[8], uint8_t flags, uint8_t *out)
+{
+	const uint8_t **parents_array;
+	size_t simd_degree_or_2, parents_array_len = 0;
+
+	simd_degree_or_2 = hs->simd_degree < 2 ? 2 : hs->simd_degree;
+
+	assert(2 <= num_chaining_values);
+	assert(num_chaining_values <= 2 * simd_degree_or_2);
+
+	parents_array = alloca(sizeof(parents_array[0]) * simd_degree_or_2); // one pointer slot per parent node
+	parents_array_len = 0; // redundant with the initialiser above; kept as-is
+
+	while (num_chaining_values - (2 * parents_array_len) >= 2) { // each parent consumes two adjacent child CVs
+		parents_array[parents_array_len] = &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
+		parents_array_len++;
+	}
+
+	blake3_hash_many(parents_array, parents_array_len, 1, key,
+                   0, // Parents always use counter 0.
+                   false, flags | PARENT,
+                   0, // Parents have no start flags.
+                   0, // Parents have no end flags.
+                   out);
+
+	// If there's an odd child left over, it becomes an output.
+	if (num_chaining_values > 2 * parents_array_len) {
+		memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], &child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
+		return parents_array_len + 1;
+	}
+	return parents_array_len;
+}
+
+typedef struct { // argument/result bundle handed to the helper thread entry point
+	pthread_t thread;	// handle written by pthread_create()
+	blake3_host_state *hs;	// the fields from here to `out` are forwarded unchanged
+	const uint8_t *input;	// to blake3_compress_subtree_wide_panto()
+	size_t input_len;
+	const uint32_t *key;
+	uint64_t chunk_counter;
+	uint8_t flags;
+	uint8_t *out;
+	size_t n;		// result: value returned by blake3_compress_subtree_wide_panto()
+} Blake3CompressSubtreeParallelState;
+
+static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out);
+
+static void *blake3_compress_subtree_wide_thread_start_panto(void *arg)
+{
+	Blake3CompressSubtreeParallelState *s = arg;
+
+	// pthread entry point: hash the subtree described by *s and publish the CV count in s->n.
+	s->n = blake3_compress_subtree_wide_panto(s->hs, s->input, s->input_len, s->key, s->chunk_counter, s->flags, s->out);
+	return NULL;
+}
+
+// Hash one subtree on a helper thread and wait for it; returns the number of CVs written to out.
+static size_t blake3_compress_subtree_wide_thread_pool_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t *key, uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	Blake3CompressSubtreeParallelState s = {
+		.hs = hs,
+		.input = input,
+		.input_len = input_len,
+		.key = key,
+		.chunk_counter = chunk_counter,
+		.flags = flags,
+		.out = out,
+		.n = 0,
+	};
+	int rc;
+
+	// pthread_create() may fail (e.g. EAGAIN); an assert() would vanish under
+	// NDEBUG and report 0 CVs, so hash inline on the calling thread instead.
+	rc = pthread_create(&s.thread, NULL, blake3_compress_subtree_wide_thread_start_panto, &s);
+	if (rc != 0)
+		return blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, out);
+
+	rc = pthread_join(s.thread, NULL);
+	assert(rc == 0);
+	(void)rc;	// keeps NDEBUG builds free of "set but not used" warnings
+	return s.n;
+}
+
+// The wide helper function returns (writes out) an array of chaining values
+// and returns the length of that array. The number of chaining values returned
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// if the input is shorter than that many chunks. The reason for maintaining a
+// wide array of chaining values going back up the tree, is to allow the
+// implementation to hash as many parents in parallel as possible.
+//
+// As a special case when the SIMD degree is 1, this function will still return
+// at least 2 outputs. This guarantees that this function doesn't perform the
+// root compression. (If it did, it would use the wrong flags, and also we
+// wouldn't be able to implement extendable output.) Note that this function is
+// not used when the whole input is only 1 chunk long; that's a different
+// codepath.
+//
+// Why not just have the caller split the input on the first update(), instead
+// of implementing this special rule? Because we don't want to limit SIMD or
+// multi-threading parallelism for that update().
+static size_t blake3_compress_subtree_wide_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	size_t left_input_len, right_input_len, simd_degree, simd_degree_or_2, degree, left_n, right_n;
+	const uint8_t *left_input, *right_input;
+	uint64_t left_chunk_counter, right_chunk_counter;
+	uint8_t *cv_array, *left_cvs, *right_cvs;
+
+	simd_degree = hs->simd_degree; // runtime value cached in the host state
+	simd_degree_or_2 = simd_degree < 2 ? 2 : simd_degree;
+
+	// Note that the single chunk case does *not* bump the SIMD degree up to 2
+	// when it is 1. If this implementation adds multi-threading in the future,
+	// this gives us the option of multi-threading even the 2-chunk case, which
+	// can help performance on smaller platforms.
+	if (input_len <= simd_degree * BLAKE3_CHUNK_LEN)
+		return compress_chunks_parallel_panto(hs, input, input_len, key, chunk_counter, flags, out);
+
+	// With more than simd_degree chunks, we need to recurse. Start by dividing
+	// the input into left and right subtrees. (Note that this is only optimal
+	// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
+	// of 3 or something, we'll need a more complicated strategy.)
+	left_input_len = left_len(input_len);
+	right_input_len = input_len - left_input_len;
+
+	left_input = &input[0];
+	right_input = &input[left_input_len];
+	left_chunk_counter = chunk_counter;
+	right_chunk_counter = chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
+
+	// Make space for the child outputs. Here we use simd_degree_or_2 to
+	// account for the special case of returning 2 outputs when the SIMD degree
+	// is 1.
+	cv_array = alloca(2 * simd_degree_or_2 * BLAKE3_OUT_LEN); // left and right halves share this one buffer
+	degree = simd_degree;
+	if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+		// The special case: We always use a degree of at least two, to make
+		// sure there are two outputs. Except, as noted above, at the chunk
+		// level, where we allow degree=1. (Note that the 1-chunk-input case is
+		// a different codepath.)
+		degree = 2;
+	}
+	left_cvs = &cv_array[0];
+	right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+	// Recurse! Each half goes through the *_thread_pool_panto helper, which only
+	// returns once that half is finished, so the two halves still run in sequence.
+#if 0
+	left_n = blake3_compress_subtree_wide_panto(hs, left_input, left_input_len, key, left_chunk_counter, flags, left_cvs);
+	right_n = blake3_compress_subtree_wide_panto(hs, right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+#else
+	left_n = blake3_compress_subtree_wide_thread_pool_panto(hs, left_input, left_input_len, key, left_chunk_counter, flags, left_cvs);
+	right_n = blake3_compress_subtree_wide_thread_pool_panto(hs, right_input, right_input_len, key, right_chunk_counter, flags, right_cvs);
+#endif
+
+	// The special case again. If simd_degree=1, then we'll have left_n=1 and
+	// right_n=1. Rather than compressing them into a single output, return
+	// them directly, to make sure we always have at least two outputs.
+	if (left_n == 1) {
+		memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+		return 2;
+	}
+
+	// Otherwise, do one layer of parent node compression.
+	return compress_parents_parallel_panto(hs, cv_array, left_n + right_n, key, flags, out);
+}
+
+// Hash a subtree with blake3_compress_subtree_wide_panto(), and then condense
+// the resulting list of chaining values down to a single parent node. Don't
+// compress that last parent node, however. Instead, return its message bytes (the
+// concatenated chaining values of its children). This is necessary when the
+// first call to update() supplies a complete subtree, because the topmost
+// parent node of that subtree could end up being the root. It's also necessary
+// for extended output in the general case.
+//
+// As with compress_subtree_wide(), this function is not used on inputs of 1
+// chunk or less. That's a different codepath.
+static void compress_subtree_to_parent_node_panto(blake3_host_state *hs, const uint8_t *input, size_t input_len, const uint32_t key[8], uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
+{
+	uint8_t *cv_array, *out_array;
+	size_t simd_degree_or_2, num_cvs;
+
+	simd_degree_or_2 = hs->simd_degree < 2 ? 2 : hs->simd_degree;
+
+	cv_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN); // receives the CVs of the whole subtree
+	out_array = alloca(simd_degree_or_2 * BLAKE3_OUT_LEN / 2); // scratch for one parent round; half the size of cv_array
+
+	assert(input_len > BLAKE3_CHUNK_LEN);
+
+	num_cvs = blake3_compress_subtree_wide_panto(hs, input, input_len, key, chunk_counter, flags, cv_array);
+
+	assert(num_cvs <= simd_degree_or_2);
+
+	// If the SIMD degree is greater than 2 and there's enough input,
+	// the wide helper returns more than 2 chaining values. Condense
+	// them into 2 by forming parent nodes repeatedly.
+	//
+	// The second half of this loop condition is always true, and we just
+	// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
+	// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
+	// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
+	// this code, test it against that version.
+	while (num_cvs > 2 && num_cvs <= simd_degree_or_2) {
+		num_cvs = compress_parents_parallel_panto(hs, cv_array, num_cvs, key, flags, out_array);
+		memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); // results become the input of the next round
+	}
+	memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+void blake3_hasher_update_panto(blake3_hasher *self, const void *input, size_t input_len, size_t offset, size_t total_size)
+{
+	const uint8_t *input_bytes = input;
+	output_t output;
+	uint8_t cv[BLAKE3_OUT_LEN], cv_pair[2 * BLAKE3_OUT_LEN];
+	blake3_chunk_state chunk_state;
+	size_t take, subtree_len;
+	uint64_t count_so_far, subtree_chunks;
+
+	// Absorb input_len bytes of input into the hasher. offset and total_size are accepted but not consulted yet.
+	(void)offset; (void)total_size;
+	// Explicitly checking for zero avoids causing UB by passing a null pointer
+	// to memcpy. This comes up in practice with things like:
+	//   std::vector<uint8_t> v;
+	//   blake3_hasher_update(&hasher, v.data(), v.size());
+	if (input_len == 0)
+		return;
+
+	// If we have some partial chunk bytes in the internal chunk_state, we need
+	// to finish that chunk first.
+	if (chunk_state_len(&self->chunk) > 0) {
+
+		take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
+		if (take > input_len)
+			take = input_len;
+
+		chunk_state_update(&self->chunk, input_bytes, take);
+		input_bytes += take;
+		input_len -= take;
+
+		// If we've filled the current chunk and there's more coming, finalize this
+		// chunk and proceed. In this case we know it's not the root.
+		if (input_len == 0)
+			return;
+
+		output = chunk_state_output(&self->chunk);
+		output_chaining_value(&output, cv);
+		hasher_push_cv(self, cv, self->chunk.chunk_counter);
+		chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
+	}
+
+	// Now the chunk_state is clear, and we have more input. If there's more than
+	// a single chunk (so, definitely not the root chunk), hash the largest whole
+	// subtree we can, with the full benefits of SIMD (and maybe in the future,
+	// multi-threading) parallelism. Two restrictions:
+	// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
+	//   the right edge can be incomplete, and we don't know where the right edge
+	//   is going to be until we get to finalize().
+	// - The subtree must evenly divide the total number of chunks up until this
+	//   point (if total is not 0). If the current incomplete subtree is only
+	//   waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
+	//   to complete the current subtree first.
+	// Because we might need to break up the input to form powers of 2, or to
+	// evenly divide what we already have, this part runs in a loop.
+
+	while (input_len > BLAKE3_CHUNK_LEN) {
+
+		subtree_len = round_down_to_power_of_2(input_len);
+		count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+
+		// Shrink the subtree_len until it evenly divides the count so far. We know
+		// that subtree_len itself is a power of 2, so we can use a bitmasking
+		// trick instead of an actual remainder operation. (Note that if the caller
+		// consistently passes power-of-2 inputs of the same size, as is hopefully
+		// typical, this loop condition will always fail, and subtree_len will
+		// always be the full length of the input.)
+		//
+		// An aside: We don't have to shrink subtree_len quite this much. For
+		// example, if count_so_far is 1, we could pass 2 chunks to
+		// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
+		// get the right answer in the end, and we might get to use 2-way SIMD
+		// parallelism. The problem with this optimization, is that it gets us
+		// stuck always hashing 2 chunks. The total number of chunks will remain
+		// odd, and we'll never graduate to higher degrees of parallelism. See
+		// https://github.com/BLAKE3-team/BLAKE3/issues/69.
+		while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0)
+			subtree_len /= 2;
+
+		// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
+		// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
+		subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+		if (subtree_len <= BLAKE3_CHUNK_LEN) {
+			chunk_state_init(&chunk_state, self->key, self->chunk.flags);
+			chunk_state.chunk_counter = self->chunk.chunk_counter;
+			chunk_state_update(&chunk_state, input_bytes, subtree_len);
+			output = chunk_state_output(&chunk_state);	// reuse the function-level local; no shadowing
+			output_chaining_value(&output, cv);
+			hasher_push_cv(self, cv, chunk_state.chunk_counter);
+		} else {
+			compress_subtree_to_parent_node_panto(&self->host_state, input_bytes, subtree_len, self->key, self->chunk.chunk_counter, self->chunk.flags, cv_pair);
+			hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
+			hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
+				       self->chunk.chunk_counter + (subtree_chunks / 2));
+		}
+		self->chunk.chunk_counter += subtree_chunks;
+		input_bytes += subtree_len;
+		input_len -= subtree_len;
+	}
+
+	// If there's any remaining input less than a full chunk, add it to the chunk
+	// state. In that case, also do a final merge loop to make sure the subtree
+	// stack doesn't contain any unmerged pairs. The remaining input means we
+	// know these merges are non-root. This merge loop isn't strictly necessary
+	// here, because hasher_push_chunk_cv already does its own merge loop, but it
+	// simplifies blake3_hasher_finalize below.
+	if (input_len > 0) {
+		chunk_state_update(&self->chunk, input_bytes, input_len);
+		hasher_merge_cv_stack(self, self->chunk.chunk_counter);
+	}
+}
diff --git a/src/blake3/blake3.h b/src/blake3/blake3.h
index 21e0d7b9..15fe1f22 100644
--- a/src/blake3/blake3.h
+++ b/src/blake3/blake3.h
@@ -49,6 +49,11 @@ typedef struct {
 } blake3_chunk_state;
 
 typedef struct {
+  size_t simd_degree; // filled from blake3_simd_degree() by hasher_init_host_state() in blake3.c
+} blake3_host_state;
+
+typedef struct {
+  blake3_host_state host_state;
   uint32_t key[8];
   blake3_chunk_state chunk;
   uint8_t cv_stack_len;
@@ -75,6 +80,8 @@ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t
                                             uint8_t *out, size_t out_len);
 BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
 
+BLAKE3_API void blake3_hasher_update_panto(blake3_hasher *self, const void *input,
+                                           size_t input_len, size_t offset, size_t total_size);
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/internal/fy-b3sum.c b/src/internal/fy-b3sum.c
index 21b1bfbd..57707d58 100644
--- a/src/internal/fy-b3sum.c
+++ b/src/internal/fy-b3sum.c
@@ -15,58 +15,175 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 #include <alloca.h>
+#include <stdbool.h>
+#include <getopt.h>
 
 #include <blake3.h>
 
+#define OPT_NO_MMAP		128
+
+static const struct option lopts[] = {	/* read-only table handed to getopt_long_only() */
+	{"no-mmap",		no_argument,		0,	OPT_NO_MMAP },	/* long-only option; 128 cannot clash with an ASCII short option */
+	{"help",		no_argument,		0,	'h' },
+	{0,			0,			0,	0 },		/* terminator */
+};
+
+static void display_usage(FILE *fp, char *progname)	/* print usage to fp; exits with failure when fp is stderr */
+{
+	fprintf(fp, "Usage: %s [options] [files]\n", progname);
+	fprintf(fp, "\nOptions:\n\n");
+	fprintf(fp, "\t--no-mmap                : Do not mmap files, read them as streams\n");
+	fprintf(fp, "\t--help, -h               : Display help message\n");
+	if (fp == stderr)
+		exit(EXIT_FAILURE);
+}
+
 int main(int argc, char *argv[])
 {
+	struct stat sb;
+	void *mem = NULL;
 	blake3_hasher hasher;
 	uint8_t output[BLAKE3_OUT_LEN];
 	const char *filename;
 	uint8_t *buf;
 	size_t bufsz = 1 << 16;	/* 64K */
-	size_t rdn;
-	FILE *fp;
-	int i;
-
-	if (argc > 1) {
-		filename = argv[1];
-		fp = fopen(filename, "rb");
-		if (!fp) {
-			fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno));
-			return EXIT_FAILURE;
+	size_t rdn, filesize, offset;
+	FILE *fp = NULL;
+	int i, j, opt, lidx, rc, num_inputs, num_ok, fd = -1, exitcode;
+	bool no_mmap = false;
+
+	exitcode = EXIT_FAILURE;
+
+	while ((opt = getopt_long_only(argc, argv, "h", lopts, &lidx)) != -1) {
+		switch (opt) {
+		case OPT_NO_MMAP:
+			no_mmap = true;
+			break;
+		case 'h' :
+		default:
+			if (opt != 'h')
+				fprintf(stderr, "Unknown option\n");
+			display_usage(opt == 'h' ? stdout : stderr, argv[0]);
+			return EXIT_SUCCESS;
 		}
-	} else {
-		filename = "<stdin>";
-		fp = stdin;
 	}
 
-	// Initialize the hasher.
-	blake3_hasher_init(&hasher);
+	buf = NULL;
+
+	num_inputs = argc - optind;
+	if (num_inputs <= 0) {
+		fprintf(stderr, "Missing file(s) argument\n");
+		display_usage(stderr, argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	num_ok = 0;
+	for (i = optind; i < argc; i++) {
+
+		filename = argv[i];
+
+		if (!strcmp(filename, "-")) {
+			fp = stdin;
+		} else {
+			fp = NULL;
+
+			fd = open(filename, O_RDONLY);
+			if (fd < 0) {
+				fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno));
+				goto err_out;
+			}
+
+			rc = fstat(fd, &sb);
+			if (rc < 0) {
+				fprintf(stderr, "failed to stat %s - %s\n", filename, strerror(errno));
+				goto err_out;
+			}
 
-	buf = alloca(bufsz);
+			filesize = (size_t)-1;
 
-	do {
-		rdn = fread(buf, 1, bufsz, fp);
-		if (rdn < 0) {
-			fprintf(stderr, "error on read of %s - %s\n", filename, strerror(errno));
-			return EXIT_FAILURE;
+			/* try to mmap */
+			if (sb.st_size > 0 && !no_mmap) {
+				filesize = sb.st_size;
+				mem = mmap(NULL, filesize, PROT_READ, MAP_PRIVATE, fd, 0);
+				if (mem == MAP_FAILED)
+					mem = NULL;
+			}
+
+			/* unable to map? fallback to stream mode */
+			if (!mem) {
+				fp = fdopen(fd, "r");
+				if (!fp) {
+					fprintf(stderr, "unable to open %s - %s\n", filename, strerror(errno));
+					goto err_out;
+				}
+				fd = -1;	/* fp owns the descriptor now; fclose() will close it */
+			}
+		}
+
+		// Initialize the hasher.
+		blake3_hasher_init(&hasher);
+
+		if (mem) {
+			blake3_hasher_update_panto(&hasher, mem, filesize, 0, filesize);
+			munmap(mem, filesize);
+			mem = NULL;	/* a stale mapping must never be hashed for the next input */
+			close(fd);
+			fd = -1;	/* avoid a second close() later on */
+		} else if (fp) {
+			if (!buf)
+				buf = alloca(bufsz);
+			offset = 0;
+			do {
+				rdn = fread(buf, 1, bufsz, fp);
+				if (rdn > 0)
+					blake3_hasher_update_panto(&hasher, buf, rdn, offset, (size_t)-1);
+				offset += rdn;
+			} while (rdn >= bufsz);
+
+			/* fread() returns size_t, so errors are only visible via ferror() */
+			if (ferror(fp)) {
+				fprintf(stderr, "error on read of %s - %s\n", filename, strerror(errno));
+				goto err_out;
+			}
+		} else {
+			fprintf(stderr, "Can't do anything for %s\n", filename);
+			goto err_out;
+		}
+
+		// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
+		blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+
+		// Print the hash as hexadecimal (j, not i: i drives the file loop).
+		for (j = 0; j < BLAKE3_OUT_LEN; j++)
+			printf("%02x", output[j]);
+		printf("  %s\n", filename);
+
+		if (fp) {
+			if (fp != stdin)
+				fclose(fp);
+			fp = NULL;
 		}
-		if (rdn > 0)
-			blake3_hasher_update(&hasher, buf, rdn);
-	} while (rdn >= bufsz);
+
+		num_ok++;
 
-	// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
-	blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+	}
+
+	if (num_inputs == num_ok)
+		exitcode = EXIT_SUCCESS;
 
-	// Print the hash as hexadecimal.
-	for (i = 0; i < BLAKE3_OUT_LEN; i++)
-	    printf("%02x", output[i]);
-	printf("\n");
+err_out:
 
-	if (fp != stdin)
+	if (fp && fp != stdin)
 		fclose(fp);
 
-	return 0;
+	if (fd >= 0)
+		close(fd);
+
+	return exitcode;
 }