diff --git a/lib/BUILD.bazel b/lib/BUILD.bazel
index 72143cf2e..009ddf8f7 100644
--- a/lib/BUILD.bazel
+++ b/lib/BUILD.bazel
@@ -474,6 +474,7 @@ cc_library(
         "nnc/_ccv_cnnp_model.h",
         "nnc/ccv_nnc_tensor.c",
         "nnc/ccv_nnc_tensor_io.c",
+        "nnc/ccv_nnc_tensor_palettize.c",
         "nnc/ccv_nnc_tensor_tape.c",
         "nnc/ccv_nnc_cmd.c",
         "nnc/ccv_nnc_stream.c",
diff --git a/lib/nnc/ccv_nnc.h b/lib/nnc/ccv_nnc.h
index c876ed271..8ee3e54ac 100644
--- a/lib/nnc/ccv_nnc.h
+++ b/lib/nnc/ccv_nnc.h
@@ -844,6 +844,31 @@ void ccv_nnc_set_profiler(int state);
  * @param state 1 is on, 0 is off. Default to off.
  */
 void ccv_nnc_set_memory_efficient(int state);
+/**
+ * Quantize a given memory region of a given datatype, residing in a given memory type, into an n-bit palette.
+ * @param input The input memory region; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param datatype The datatype; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param memory_type Where the memory resides. Right now only CPU_MEMORY is supported.
+ * @param input_length How many elements are in the input.
+ * @param qbits How many bits for the palette. Right now only 4 / 5 / 6 / 7 / 8 bits are supported.
+ * @param number_in_blocks How many elements share a palette.
+ * @param output The output memory region.
+ * @param output_length The maximum size of the output in bytes.
+ * @return The actual length in bytes of the output.
+ */
+CCV_WARN_UNUSED(size_t) ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length);
+/**
+ * Dequantize a given memory region of a given datatype, residing in a given memory type, from its built-in n-bit palette.
+ * @param input The input memory region.
+ * @param datatype The datatype; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param memory_type Where the memory resides. It can be either CPU_MEMORY or GPU_MEMORY.
+ * @param input_length The size of the input in bytes.
+ * @param qbits How many bits for the palette. Right now only 4 / 5 / 6 / 7 / 8 bits are supported.
+ * @param number_in_blocks How many elements share a palette.
+ * @param output The output memory region; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param output_length How many elements are in the output.
+ */ +void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length); /** @} */ diff --git a/lib/nnc/ccv_nnc_tensor_palettize.c b/lib/nnc/ccv_nnc_tensor_palettize.c new file mode 100644 index 000000000..b1f3cdc90 --- /dev/null +++ b/lib/nnc/ccv_nnc_tensor_palettize.c @@ -0,0 +1,952 @@ +#include "ccv_nnc.h" +#include "ccv_nnc_internal.h" + +size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) +{ + assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); + assert(memory_type == CCV_TENSOR_CPU_MEMORY); + const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks; + const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); + uint8_t* const u8 = (uint8_t*)output; + uint8_t* const ui = (uint8_t*)input; + assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[16]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 16, indices, centroids); + uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 16; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 16); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 16; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 16); + } + u80 += 16 * element_size; + for (j = 0; j < nI; j += 2) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + *u80 = (i0 << 4) | i1; + ++u80; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 16 + (input_length + 1) / 2; + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[32]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 32, indices, centroids); + uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 32; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 32); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 32; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 32); + } + u80 += 32 * element_size; + for (j = 0; j < nI; j += 8) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 0; + const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 0; + const uint8_t i6 = j + 6 < nI ? 
(uint8_t)indices[j + 6] : 0; + const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7] : 0; + u80[0] = (i0 << 3) | (i1 >> 2); + u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4); + u80[2] = (i3 << 4) | (i4 >> 1); + u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3); + u80[4] = (i6 << 5) | i7; + u80 += 5; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5; + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[64]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 64, indices, centroids); + uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 64; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 64); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 64; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 64); + } + u80 += 64 * element_size; + for (j = 0; j < nI; j += 4) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + u80[0] = (i0 << 2) | (i1 >> 4); + u80[1] = (i1 << 4) | (i2 >> 2); + u80[2] = (i2 << 6) | i3; + u80 += 3; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3; + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[128]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 128, indices, centroids); + uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 128; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 128); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 128; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 128); + } + u80 += 128 * element_size; + for (j = 0; j < nI; j += 8) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 0; + const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 0; + const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 0; + const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7] : 0; + u80[0] = (i0 << 1) | (i1 >> 6); + u80[1] = (i1 << 2) | (i2 >> 5); + u80[2] = (i2 << 3) | (i3 >> 4); + u80[3] = (i3 << 4) | (i4 >> 3); + u80[4] = (i4 << 5) | (i5 >> 2); + u80[5] = (i5 << 6) | (i6 >> 1); + u80[6] = (i6 << 7) | i7; + u80 += 7; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7; + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[256]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 256, indices, centroids); + uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 256; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 256); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 256; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 256); + } + u80 += 256 * element_size; + for (j = 0; j < nI; j++) + { + *u80 = (uint8_t)indices[j]; + ++u80; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 256 + input_length; + } +} + +void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) +{ + assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); + assert(memory_type == CCV_TENSOR_CPU_MEMORY); + const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks; + const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); + uint8_t* const u8 = (uint8_t*)output; + const uint8_t* const ui = (const uint8_t*)input; + assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); + if (datatype == CCV_16F) + { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + 
const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + f16[j + 4] = palette[i4]; + f16[j + 5] = palette[i5]; + f16[j + 6] = palette[i6]; + f16[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + if (j + 4 < nI) + f16[j + 4] = palette[i4]; + if (j + 5 < nI) + f16[j + 5] = palette[i5]; + if (j + 6 < nI) + f16[j + 6] = palette[i6]; + if (j + 7 < nI) + f16[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 128; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = 
ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + f16[j + 4] = palette[i4]; + f16[j + 5] = palette[i5]; + f16[j + 6] = palette[i6]; + f16[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + if (j + 4 < nI) + f16[j + 4] = palette[i4]; + if (j + 5 < nI) + f16[j + 5] = palette[i5]; + if (j + 6 < nI) + f16[j + 6] = palette[i6]; + if (j + 7 < nI) + f16[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + uint16_t* const f16 = (uint16_t*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f16[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } else if (datatype == CCV_32F) { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + float* const f32 = (float*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = 
(float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + float* const f32 = (float*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + f32[j + 4] = palette[i4]; + f32[j + 5] = palette[i5]; + f32[j + 6] = palette[i6]; + f32[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + if (j + 4 < nI) + f32[j + 4] = palette[i4]; + if (j + 5 < nI) + f32[j + 5] = palette[i5]; + if (j + 6 < nI) + f32[j + 6] = palette[i6]; + if (j + 7 < nI) + f32[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + float* const f32 = (float*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + 
const uint8_t* ui1 = ui0 + element_size * 128; + float* const f32 = (float*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + f32[j + 4] = palette[i4]; + f32[j + 5] = palette[i5]; + f32[j + 6] = palette[i6]; + f32[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + if (j + 4 < nI) + f32[j + 4] = palette[i4]; + if (j + 5 < nI) + f32[j + 5] = palette[i5]; + if (j + 6 < nI) + f32[j + 6] = palette[i6]; + if (j + 7 < nI) + f32[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + float* const f32 = (float*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f32[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } else { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + double* const f64 = (double*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); 
+ const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + double* const f64 = (double*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + f64[j + 4] = palette[i4]; + f64[j + 5] = palette[i5]; + f64[j + 6] = palette[i6]; + f64[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + if (j + 4 < nI) + f64[j + 4] = palette[i4]; + if (j + 5 < nI) + f64[j + 5] = palette[i5]; + if (j + 6 < nI) + f64[j + 6] = palette[i6]; + if (j + 7 < nI) + f64[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + double* const f64 = (double*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const 
uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 128; + double* const f64 = (double*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + f64[j + 4] = palette[i4]; + f64[j + 5] = palette[i5]; + f64[j + 6] = palette[i6]; + f64[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + if (j + 4 < nI) + f64[j + 4] = palette[i4]; + if (j + 5 < nI) + f64[j + 5] = palette[i5]; + if (j + 6 < nI) + f64[j + 6] = palette[i6]; + if (j + 7 < nI) + f64[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + double* const f64 = (double*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f64[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } +} diff --git a/lib/nnc/makefile b/lib/nnc/makefile index 87ffd35e1..978d745d7 100644 --- a/lib/nnc/makefile +++ b/lib/nnc/makefile @@ -3,7 +3,7 @@ include ../config.mk CFLAGS := -O3 -Wall -I"../" $(CFLAGS) NVFLAGS := -O3 $(NVFLAGS) -SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c 
ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c
+SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c ccv_nnc_tensor_palettize.c
 SRC_OBJS := $(patsubst %.c,%.o,$(SRCS))
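With the new source wired into both build systems, the API reads naturally as a round trip. A minimal sketch (not part of the diff; `main`, the sizes and the values are illustrative, and the buffer sizing mirrors the qbits == 4 branch of ccv_nnc_palettize above):

```c
#include <assert.h>
#include <ccv.h>
#include <nnc/ccv_nnc.h>

int main(void)
{
	ccv_nnc_init();
	const int n = 1024;
	float* const values = ccmalloc(sizeof(float) * n);
	int i;
	for (i = 0; i < n; i++)
		values[i] = (float)(i % 16); // only 16 distinct values, so a 4-bit palette is lossless
	// Per 512-element block: a 16-entry float palette, then 4 bits per element.
	const size_t bufsize = sizeof(float) * 16 * ((n + 511) / 512) + (n + 1) / 2;
	uint8_t* const compressed = ccmalloc(bufsize);
	const size_t size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, n, 4, 512, compressed, bufsize);
	assert(size <= bufsize);
	float* const decompressed = ccmalloc(sizeof(float) * n);
	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, size, 4, 512, decompressed, n);
	for (i = 0; i < n; i++)
		assert(values[i] == decompressed[i]);
	ccfree(values);
	ccfree(compressed);
	ccfree(decompressed);
	return 0;
}
```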
diff --git a/test/unit/nnc/makefile b/test/unit/nnc/makefile
index 4eb8b2359..982b1d460 100644
--- a/test/unit/nnc/makefile
+++ b/test/unit/nnc/makefile
@@ -3,7 +3,7 @@ include ../../../lib/config.mk
 export LSAN_OPTIONS=suppressions=known-leaks.txt
 LDFLAGS := -L"../../../lib" -lccv $(LDFLAGS)
 CFLAGS := -O3 -Wall -I"../../../lib" -I"../../" $(CFLAGS)
-TARGETS = tfb.tests tensor.tests forward.tests backward.tests gradient.tests graph.tests winograd.tests transform.tests symbolic.graph.tests autograd.tests autograd.vector.tests while.tests tape.tests while.backward.tests case_of.tests case_of.backward.tests numa.tests tensor.bind.tests broadcast.tests reduce.tests batch.norm.tests layer.norm.tests dropout.tests crossentropy.tests dynamic.graph.tests simplify.tests symbolic.graph.compile.tests rand.tests graph.io.tests cnnp.core.tests minimize.tests custom.tests parallel.tests dataframe.tests dataframe.addons.tests compression.tests gemm.tests index.tests swish.tests upsample.tests smooth_l1.tests roi_align.tests nms.tests compare.tests concat.tests cblas.tests micro.tests loss.tests histogram.tests group.norm.tests gelu.tests attention.tests
+TARGETS = tfb.tests tensor.tests forward.tests backward.tests gradient.tests graph.tests winograd.tests transform.tests symbolic.graph.tests autograd.tests autograd.vector.tests while.tests tape.tests while.backward.tests case_of.tests case_of.backward.tests numa.tests tensor.bind.tests broadcast.tests reduce.tests batch.norm.tests layer.norm.tests dropout.tests crossentropy.tests dynamic.graph.tests simplify.tests symbolic.graph.compile.tests rand.tests graph.io.tests cnnp.core.tests minimize.tests custom.tests parallel.tests dataframe.tests dataframe.addons.tests compression.tests gemm.tests index.tests swish.tests upsample.tests smooth_l1.tests roi_align.tests nms.tests compare.tests concat.tests cblas.tests micro.tests loss.tests histogram.tests group.norm.tests gelu.tests attention.tests palettize.tests
 ifeq ($(shell uname 2>/dev/null || echo Unknown),Darwin)
 	LDFLAGS += -Wl,-U,___test_case_setup,-U,___test_case_teardown
diff --git a/test/unit/nnc/palettize.tests.c b/test/unit/nnc/palettize.tests.c
new file mode 100644
index 000000000..495b98188
--- /dev/null
+++ b/test/unit/nnc/palettize.tests.c
@@ -0,0 +1,610 @@
+#include "case.h"
+#include "ccv_case.h"
+#include "ccv_nnc_case.h"
+#include <ccv.h>
+#include <nnc/ccv_nnc.h>
+#include <nnc/ccv_nnc_easy.h>
+#include "3rdparty/dsfmt/dSFMT.h"
+
+TEST_SETUP()
+{
+	ccv_nnc_init();
+}
+
+TEST_CASE("quantize double to 4-bit and dequantize on CPU losslessly")
+{
+	double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944);
+	REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 4-bit and dequantize on CPU losslessly")
+{
+	float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	float* const values = ccmalloc(sizeof(float) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 4-bit and dequantize on CPU losslessly")
+{
+	float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	uint16_t lut[16];
+	ccv_float_to_half_precision(lut_f32, lut, 16);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
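The expected-size constants in these tests follow directly from the layout ccv_nnc_palettize writes per block: a full 2^qbits-entry palette in the source datatype, then the bit-packed indices. A small helper reproducing that arithmetic (illustrative only; `palettized_size` is not part of the library):

```c
#include <stddef.h>

// Mirrors the return statements of ccv_nnc_palettize above.
static size_t palettized_size(size_t len, size_t element_size, int qbits, size_t number_in_blocks)
{
	const size_t num_blocks = (len + number_in_blocks - 1) / number_in_blocks;
	const size_t palette_bytes = element_size * num_blocks * ((size_t)1 << qbits);
	size_t index_bytes = 0;
	switch (qbits) {
		case 4: index_bytes = (len + 1) / 2; break;     // 2 indices per byte
		case 5: index_bytes = (len + 7) / 8 * 5; break; // 8 indices per 5 bytes
		case 6: index_bytes = (len + 3) / 4 * 3; break; // 4 indices per 3 bytes
		case 7: index_bytes = (len + 7) / 8 * 7; break; // 8 indices per 7 bytes
		case 8: index_bytes = len; break;               // 1 index per byte
	}
	return palette_bytes + index_bytes;
}
// e.g. 2839 doubles, qbits = 4, blocks of 128: ceil(2839 / 128) = 23 blocks,
// 23 * 16 * 8 = 2944 palette bytes plus ceil(2839 / 2) = 1420 index bytes,
// which is the 1420 + 2944 constant used above.
```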
+
+TEST_CASE("quantize double to 5-bit and dequantize on CPU losslessly")
+{
+	double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 5-bit and dequantize on CPU losslessly")
+{
+	float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	float* const values = ccmalloc(sizeof(float) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 5-bit and dequantize on CPU losslessly")
+{
+	float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	uint16_t lut[32];
+	ccv_float_to_half_precision(lut_f32, lut, 32);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values,
output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 6-bit and dequantize on CPU losslessly") +{ + float lut[64]; + int i; + for (i = 0; i < 64; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 64]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4); + REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 6-bit and dequantize on CPU losslessly") +{ + float lut_f32[64]; + int i; + for (i = 0; i < 64; i++) + lut_f32[i] = (float)i; + uint16_t lut[64]; + ccv_float_to_half_precision(lut_f32, lut, 64); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 64]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2); + REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize double to 7-bit and dequantize on CPU losslessly") +{ + double lut[128]; + int i; + for (i = 0; i < 128; i++) + lut[i] = (double)i; + double* const values = ccmalloc(sizeof(double) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 8)); + const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); + double* const output_values = ccmalloc(sizeof(double) * 2839); + ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 7-bit and dequantize on CPU losslessly") +{ + float lut[128]; + int i; + for (i = 0; i < 128; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(float, 
values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 7-bit and dequantize on CPU losslessly") +{ + float lut_f32[128]; + int i; + for (i = 0; i < 128; i++) + lut_f32[i] = (float)i; + uint16_t lut[128]; + ccv_float_to_half_precision(lut_f32, lut, 128); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize double to 8-bit and dequantize on CPU losslessly") +{ + double lut[256]; + int i; + for (i = 0; i < 256; i++) + lut[i] = (double)i; + double* const values = ccmalloc(sizeof(double) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 8)); + const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match"); + double* const output_values = ccmalloc(sizeof(double) * 2839); + ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839); + REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 8-bit and dequantize on CPU losslessly") +{ + float lut[256]; + int i; + for (i = 0; i < 256; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839); + REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 8-bit and dequantize on CPU losslessly") +{ + float lut_f32[256]; + int i; + for (i = 0; i < 256; i++) + lut_f32[i] = (float)i; + uint16_t lut[256]; + ccv_float_to_half_precision(lut_f32, lut, 256); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + 
ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944);
+	REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	uint16_t lut[16];
+	ccv_float_to_half_precision(lut_f32, lut, 16);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
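The 5-bit cases exercise the least obvious packing: eight 5-bit indices straddle exactly five bytes. A self-contained round-trip sketch of that scheme, mirroring the shift/mask pattern in ccv_nnc_tensor_palettize.c (`pack5` and `unpack5` are illustrative names, not library functions):

```c
#include <assert.h>
#include <stdint.h>

// Pack 8 indices in [0, 31] into 5 bytes, high bits first.
static void pack5(const uint8_t idx[8], uint8_t out[5])
{
	out[0] = (idx[0] << 3) | (idx[1] >> 2);
	out[1] = (idx[1] << 6) | (idx[2] << 1) | (idx[3] >> 4);
	out[2] = (idx[3] << 4) | (idx[4] >> 1);
	out[3] = (idx[4] << 7) | (idx[5] << 2) | (idx[6] >> 3);
	out[4] = (idx[6] << 5) | idx[7];
}

// Recover the 8 indices, exactly as the depalettize loops above do.
static void unpack5(const uint8_t in[5], uint8_t idx[8])
{
	idx[0] = in[0] >> 3;
	idx[1] = ((in[0] & 7) << 2) | (in[1] >> 6);
	idx[2] = (in[1] >> 1) & 31;
	idx[3] = ((in[1] & 1) << 4) | (in[2] >> 4);
	idx[4] = ((in[2] & 15) << 1) | (in[3] >> 7);
	idx[5] = (in[3] >> 2) & 31;
	idx[6] = ((in[3] & 3) << 3) | (in[4] >> 5);
	idx[7] = in[4] & 31;
}

int main(void)
{
	const uint8_t idx[8] = {1, 30, 17, 4, 22, 9, 31, 0};
	uint8_t bytes[5], back[8];
	int i;
	pack5(idx, bytes);
	unpack5(bytes, back);
	for (i = 0; i < 8; i++)
		assert(idx[i] == back[i]);
	return 0;
}
```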
+
+TEST_CASE("quantize double to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	uint16_t lut[32];
+	ccv_float_to_half_precision(lut_f32, lut, 32);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 6-bit and
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[64];
+	ccv_float_to_half_precision(lut_f32, lut, 64);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 2);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
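+
+/* The 7-bit cases below also use blocks of 512 elements (6 blocks). Each
+ * block stores a 128-entry palette (6 * 128 * element_size bytes in total)
+ * and indices packed eight elements into seven bytes
+ * (2840 * 7 / 8 = 2485 bytes). */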
+
+TEST_CASE("quantize double to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[128];
+	ccv_float_to_half_precision(lut_f32, lut, 128);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
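+
+/* The 8-bit cases below use blocks of 1280 elements, so 2840 elements span 3
+ * blocks. Each block stores a 256-entry palette (3 * 256 * element_size bytes
+ * in total) and one index byte per element (2840 bytes). */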
+
+TEST_CASE("quantize double to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 4);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[256];
+	ccv_float_to_half_precision(lut_f32, lut, 256);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+#include "case_main.h"