diff --git a/lib/BUILD.bazel b/lib/BUILD.bazel
index 72143cf2e..009ddf8f7 100644
--- a/lib/BUILD.bazel
+++ b/lib/BUILD.bazel
@@ -474,6 +474,7 @@ cc_library(
         "nnc/_ccv_cnnp_model.h",
         "nnc/ccv_nnc_tensor.c",
         "nnc/ccv_nnc_tensor_io.c",
+        "nnc/ccv_nnc_tensor_palettize.c",
         "nnc/ccv_nnc_tensor_tape.c",
         "nnc/ccv_nnc_cmd.c",
         "nnc/ccv_nnc_stream.c",
diff --git a/lib/nnc/ccv_nnc.h b/lib/nnc/ccv_nnc.h
index c876ed271..8ee3e54ac 100644
--- a/lib/nnc/ccv_nnc.h
+++ b/lib/nnc/ccv_nnc.h
@@ -844,6 +844,31 @@ void ccv_nnc_set_profiler(int state);
  * @param state 1 is on, 0 is off. Default to off.
  */
 void ccv_nnc_set_memory_efficient(int state);
+/**
+ * Quantize a given memory region of a given datatype, residing in a given memory type, into an n-bit palette.
+ * @param input The input memory region; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param datatype The datatype; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param memory_type Where the memory resides. Right now only CPU_MEMORY is supported.
+ * @param input_length How many elements are in the input.
+ * @param qbits How many bits for the palette. Right now only 4 / 5 / 6 / 7 / 8 bits are supported.
+ * @param number_in_blocks How many elements share a palette.
+ * @param output The output memory region.
+ * @param output_length The maximum size of the output in bytes.
+ * @return The actual length in bytes of the output.
+ */
+CCV_WARN_UNUSED(size_t) ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length);
+/**
+ * Dequantize a given memory region of a given datatype, residing in a given memory type, from its built-in n-bit palette.
+ * @param input The input memory region.
+ * @param datatype The datatype; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param memory_type Where the memory resides. It can be either CPU_MEMORY or GPU_MEMORY.
+ * @param input_length The size of the input in bytes.
+ * @param qbits How many bits for the palette. Right now only 4 / 5 / 6 / 7 / 8 bits are supported.
+ * @param number_in_blocks How many elements share a palette.
+ * @param output The output memory region; it can be CCV_64F, CCV_32F or CCV_16F.
+ * @param output_length How many elements are in the output.
+ */ +void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length); /** @} */ diff --git a/lib/nnc/ccv_nnc_tensor_palettize.c b/lib/nnc/ccv_nnc_tensor_palettize.c new file mode 100644 index 000000000..b1f3cdc90 --- /dev/null +++ b/lib/nnc/ccv_nnc_tensor_palettize.c @@ -0,0 +1,952 @@ +#include "ccv_nnc.h" +#include "ccv_nnc_internal.h" + +size_t ccv_nnc_palettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) +{ + assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); + assert(memory_type == CCV_TENSOR_CPU_MEMORY); + const int num_blocks = (input_length + number_in_blocks - 1) / number_in_blocks; + const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); + uint8_t* const u8 = (uint8_t*)output; + uint8_t* const ui = (uint8_t*)input; + assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[16]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 16, indices, centroids); + uint8_t* u80 = u8 + (16 * element_size + number_in_blocks / 2) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 16; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 16); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 16; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 16); + } + u80 += 16 * element_size; + for (j = 0; j < nI; j += 2) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + *u80 = (i0 << 4) | i1; + ++u80; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 16 + (input_length + 1) / 2; + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[32]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 32, indices, centroids); + uint8_t* u80 = u8 + (32 * element_size + number_in_blocks / 8 * 5) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 32; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 32); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 32; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 32); + } + u80 += 32 * element_size; + for (j = 0; j < nI; j += 8) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 0; + const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 0; + const uint8_t i6 = j + 6 < nI ? 
(uint8_t)indices[j + 6] : 0; + const uint8_t i7 = j + 7 < nI ? (uint8_t)indices[j + 7] : 0; + u80[0] = (i0 << 3) | (i1 >> 2); + u80[1] = (i1 << 6) | (i2 << 1) | (i3 >> 4); + u80[2] = (i3 << 4) | (i4 >> 1); + u80[3] = (i4 << 7) | (i5 << 2) | (i6 >> 3); + u80[4] = (i6 << 5) | i7; + u80 += 5; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 32 + (input_length + 7) / 8 * 5; + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[64]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 64, indices, centroids); + uint8_t* u80 = u8 + (64 * element_size + number_in_blocks / 4 * 3) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 64; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 64); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 64; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 64); + } + u80 += 64 * element_size; + for (j = 0; j < nI; j += 4) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + u80[0] = (i0 << 2) | (i1 >> 4); + u80[1] = (i1 << 4) | (i2 >> 2); + u80[2] = (i2 << 6) | i3; + u80 += 3; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 64 + (input_length + 3) / 4 * 3; + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[128]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 128, indices, centroids); + uint8_t* u80 = u8 + (128 * element_size + number_in_blocks / 8 * 7) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 128; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 128); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 128; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 128); + } + u80 += 128 * element_size; + for (j = 0; j < nI; j += 8) + { + const uint8_t i0 = (uint8_t)indices[j]; + const uint8_t i1 = j + 1 < nI ? (uint8_t)indices[j + 1] : 0; + const uint8_t i2 = j + 2 < nI ? (uint8_t)indices[j + 2] : 0; + const uint8_t i3 = j + 3 < nI ? (uint8_t)indices[j + 3] : 0; + const uint8_t i4 = j + 4 < nI ? (uint8_t)indices[j + 4] : 0; + const uint8_t i5 = j + 5 < nI ? (uint8_t)indices[j + 5] : 0; + const uint8_t i6 = j + 6 < nI ? (uint8_t)indices[j + 6] : 0; + const uint8_t i7 = j + 7 < nI ? 
(uint8_t)indices[j + 7] : 0; + u80[0] = (i0 << 1) | (i1 >> 6); + u80[1] = (i1 << 2) | (i2 >> 5); + u80[2] = (i2 << 3) | (i3 >> 4); + u80[3] = (i3 << 4) | (i4 >> 3); + u80[4] = (i4 << 5) | (i5 >> 2); + u80[5] = (i5 << 6) | (i6 >> 1); + u80[6] = (i6 << 7) | i7; + u80 += 7; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 128 + (input_length + 7) / 8 * 7; + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, input_length - i * number_in_blocks); + int* const indices = ccmalloc(sizeof(int) * nI); + double centroids[256]; + ccv_dense_matrix_t a = ccv_dense_matrix(1, nI, datatype | CCV_C1, ui + element_size * number_in_blocks * i, 0); + ccv_kmeans1d(&a, 256, indices, centroids); + uint8_t* u80 = u8 + (256 * element_size + number_in_blocks) * i; + int j; + if (datatype == CCV_16F) + { + float* f32 = (float*)centroids; + for (j = 0; j < 256; j++) + f32[j] = (float)centroids[j]; + ccv_float_to_half_precision(f32, (uint16_t*)u80, 256); + } else if (datatype == CCV_32F) { + float* f32 = (float*)u80; + for (j = 0; j < 256; j++) + f32[j] = (float)centroids[j]; + } else { + memcpy(u80, centroids, sizeof(double) * 256); + } + u80 += 256 * element_size; + for (j = 0; j < nI; j++) + { + *u80 = (uint8_t)indices[j]; + ++u80; + } + ccfree(indices); + } parallel_endfor + return element_size * num_blocks * 256 + input_length; + } +} + +void ccv_nnc_depalettize(const void* input, const int datatype, const int memory_type, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length) +{ + assert(datatype == CCV_16F || datatype == CCV_32F || datatype == CCV_64F); + assert(memory_type == CCV_TENSOR_CPU_MEMORY); + const int num_blocks = (output_length + number_in_blocks - 1) / number_in_blocks; + const size_t element_size = CCV_GET_DATA_TYPE_SIZE(datatype); + uint8_t* const u8 = (uint8_t*)output; + const uint8_t* const ui = (const uint8_t*)input; + assert(qbits == 4 || qbits == 5 || qbits == 6 || qbits == 7 || qbits == 8); + if (datatype == CCV_16F) + { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + 
const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + f16[j + 4] = palette[i4]; + f16[j + 5] = palette[i5]; + f16[j + 6] = palette[i6]; + f16[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + if (j + 4 < nI) + f16[j + 4] = palette[i4]; + if (j + 5 < nI) + f16[j + 5] = palette[i5]; + if (j + 6 < nI) + f16[j + 6] = palette[i6]; + if (j + 7 < nI) + f16[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 128; + uint16_t* const f16 = (uint16_t*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = 
ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f16[j] = palette[i0]; + f16[j + 1] = palette[i1]; + f16[j + 2] = palette[i2]; + f16[j + 3] = palette[i3]; + f16[j + 4] = palette[i4]; + f16[j + 5] = palette[i5]; + f16[j + 6] = palette[i6]; + f16[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f16[j] = palette[i0]; + if (j + 1 < nI) + f16[j + 1] = palette[i1]; + if (j + 2 < nI) + f16[j + 2] = palette[i2]; + if (j + 3 < nI) + f16[j + 3] = palette[i3]; + if (j + 4 < nI) + f16[j + 4] = palette[i4]; + if (j + 5 < nI) + f16[j + 5] = palette[i5]; + if (j + 6 < nI) + f16[j + 6] = palette[i6]; + if (j + 7 < nI) + f16[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const uint16_t* const palette = (uint16_t*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + uint16_t* const f16 = (uint16_t*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f16[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } else if (datatype == CCV_32F) { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + float* const f32 = (float*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = 
(float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + float* const f32 = (float*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + f32[j + 4] = palette[i4]; + f32[j + 5] = palette[i5]; + f32[j + 6] = palette[i6]; + f32[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + if (j + 4 < nI) + f32[j + 4] = palette[i4]; + if (j + 5 < nI) + f32[j + 5] = palette[i5]; + if (j + 6 < nI) + f32[j + 6] = palette[i6]; + if (j + 7 < nI) + f32[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + float* const f32 = (float*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + 
const uint8_t* ui1 = ui0 + element_size * 128; + float* const f32 = (float*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f32[j] = palette[i0]; + f32[j + 1] = palette[i1]; + f32[j + 2] = palette[i2]; + f32[j + 3] = palette[i3]; + f32[j + 4] = palette[i4]; + f32[j + 5] = palette[i5]; + f32[j + 6] = palette[i6]; + f32[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f32[j] = palette[i0]; + if (j + 1 < nI) + f32[j + 1] = palette[i1]; + if (j + 2 < nI) + f32[j + 2] = palette[i2]; + if (j + 3 < nI) + f32[j + 3] = palette[i3]; + if (j + 4 < nI) + f32[j + 4] = palette[i4]; + if (j + 5 < nI) + f32[j + 5] = palette[i5]; + if (j + 6 < nI) + f32[j + 6] = palette[i6]; + if (j + 7 < nI) + f32[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const float* const palette = (float*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + float* const f32 = (float*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f32[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } else { + if (qbits == 4) + { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 16 + number_in_blocks / 2) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 16; + double* const f64 = (double*)u80; + int j; + if (nI % 2 == 0) + { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + ++ui1; + } + } else { + for (j = 0; j < nI; j += 2) + { + const uint8_t u0 = *ui1; + const int i0 = (int)(u0 >> 4); + const int i1 = (int)(u0 & 15); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + ++ui1; + } + } + } parallel_endfor + } else if (qbits == 5) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); 
+ const uint8_t* const ui0 = ui + (element_size * 32 + number_in_blocks / 8 * 5) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 32; + double* const f64 = (double*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + f64[j + 4] = palette[i4]; + f64[j + 5] = palette[i5]; + f64[j + 6] = palette[i6]; + f64[j + 7] = palette[i7]; + ui1 += 5; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const int i0 = (int)(u0 >> 3); + const int i1 = (int)(((u0 & 7) << 2) | (u1 >> 6)); + const int i2 = (int)((u1 >> 1) & 31); + const int i3 = (int)(((u1 & 1) << 4) | (u2 >> 4)); + const int i4 = (int)(((u2 & 15) << 1) | (u3 >> 7)); + const int i5 = (int)((u3 >> 2) & 31); + const int i6 = (int)(((u3 & 3) << 3) | (u4 >> 5)); + const int i7 = (int)(u4 & 31); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + if (j + 4 < nI) + f64[j + 4] = palette[i4]; + if (j + 5 < nI) + f64[j + 5] = palette[i5]; + if (j + 6 < nI) + f64[j + 6] = palette[i6]; + if (j + 7 < nI) + f64[j + 7] = palette[i7]; + ui1 += 5; + } + } + } parallel_endfor + } else if (qbits == 6) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 64 + number_in_blocks / 4 * 3) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 64; + double* const f64 = (double*)u80; + int j; + if (nI % 4 == 0) + { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + ui1 += 3; + } + } else { + for (j = 0; j < nI; j += 4) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const int i0 = (int)(u0 >> 2); + const int i1 = (int)(((u0 & 3) << 4) | (u1 >> 4)); + const int i2 = (int)(((u1 & 15) << 2) | (u2 >> 6)); + const int i3 = (int)(u2 & 63); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + ui1 += 3; + } + } + } parallel_endfor + } else if (qbits == 7) { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const 
uint8_t* const ui0 = ui + (element_size * 128 + number_in_blocks / 8 * 7) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 128; + double* const f64 = (double*)u80; + int j; + if (nI % 8 == 0) + { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f64[j] = palette[i0]; + f64[j + 1] = palette[i1]; + f64[j + 2] = palette[i2]; + f64[j + 3] = palette[i3]; + f64[j + 4] = palette[i4]; + f64[j + 5] = palette[i5]; + f64[j + 6] = palette[i6]; + f64[j + 7] = palette[i7]; + ui1 += 7; + } + } else { + for (j = 0; j < nI; j += 8) + { + const uint8_t u0 = ui1[0]; + const uint8_t u1 = ui1[1]; + const uint8_t u2 = ui1[2]; + const uint8_t u3 = ui1[3]; + const uint8_t u4 = ui1[4]; + const uint8_t u5 = ui1[5]; + const uint8_t u6 = ui1[6]; + const int i0 = (int)(u0 >> 1); + const int i1 = (int)(((u0 & 1) << 6) | (u1 >> 2)); + const int i2 = (int)(((u1 & 3) << 5) | (u2 >> 3)); + const int i3 = (int)(((u2 & 7) << 4) | (u3 >> 4)); + const int i4 = (int)(((u3 & 15) << 3) | (u4 >> 5)); + const int i5 = (int)(((u4 & 31) << 2) | (u5 >> 6)); + const int i6 = (int)(((u5 & 63) << 1) | (u6 >> 7)); + const int i7 = (int)(u6 & 127); + f64[j] = palette[i0]; + if (j + 1 < nI) + f64[j + 1] = palette[i1]; + if (j + 2 < nI) + f64[j + 2] = palette[i2]; + if (j + 3 < nI) + f64[j + 3] = palette[i3]; + if (j + 4 < nI) + f64[j + 4] = palette[i4]; + if (j + 5 < nI) + f64[j + 5] = palette[i5]; + if (j + 6 < nI) + f64[j + 6] = palette[i6]; + if (j + 7 < nI) + f64[j + 7] = palette[i7]; + ui1 += 7; + } + } + } parallel_endfor + } else { + parallel_for(i, num_blocks) { + const int nI = ccv_min(number_in_blocks, output_length - i * number_in_blocks); + const uint8_t* const ui0 = ui + (element_size * 256 + number_in_blocks) * i; + uint8_t* const u80 = u8 + element_size * number_in_blocks * i; + const double* const palette = (double*)ui0; + const uint8_t* ui1 = ui0 + element_size * 256; + double* const f64 = (double*)u80; + int j; + for (j = 0; j < nI; j++) + { + const uint8_t u0 = *ui1; + f64[j] = palette[u0]; + ++ui1; + } + } parallel_endfor + } + } +} diff --git a/lib/nnc/makefile b/lib/nnc/makefile index 87ffd35e1..978d745d7 100644 --- a/lib/nnc/makefile +++ b/lib/nnc/makefile @@ -3,7 +3,7 @@ include ../config.mk CFLAGS := -O3 -Wall -I"../" $(CFLAGS) NVFLAGS := -O3 $(NVFLAGS) -SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c 
ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c
+SRCS := ccv_nnc_cmd.c ccv_nnc_tensor.c ccv_nnc_tensor_io.c ccv_nnc_stream.c ccv_nnc_micro.c ccv_nnc_micro_core.c ccv_nnc_micro_interpret.c ccv_nnc_micro_simplify.c ccv_nnc_graph.c ccv_nnc_symbolic_graph.c ccv_nnc_symbolic_graph_io.c ccv_nnc_symbolic_graph_compile.c ccv_nnc_symbolic_graph_backward.c ccv_nnc_symbolic_graph_while.c ccv_nnc_graph_while.c ccv_nnc_tensor_tape.c ccv_nnc_symbolic_graph_case_of.c ccv_nnc_graph_case_of.c ccv_nnc_symbolic_graph_minimize.c ccv_nnc_symbolic_graph_parallel.c ccv_nnc_symbolic_graph_simplify.c ccv_nnc_symbolic_graph_memory_compression.c ccv_nnc_symbolic_graph_memory_reduction.c ccv_nnc_graph_run.c ccv_nnc_xpu_alloc.c ccv_nnc_dynamic_graph.c ccv_nnc_dynamic_graph_alloc.c ccv_nnc_dynamic_graph_backward.c ccv_nnc_dynamic_graph_apply_gradients.c ccv_nnc_dynamic_graph_minimize.c ccv_nnc_dynamic_graph_evaluate.c ccv_cnnp_dataframe.c ccv_cnnp_dataframe_core.c ccv_cnnp_dataframe_addons.c ccv_cnnp_dataframe_csv.c ccv_cnnp_model.c ccv_cnnp_model_io.c ccv_cnnp_model_core.c ccv_cnnp_model_addons.c co.c ccv_nnc_tensor_palettize.c
 SRC_OBJS := $(patsubst %.c,%.o,$(SRCS))
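With the new source wired into both build systems, the API reads naturally as a round trip. A minimal sketch (not part of the diff; `main`, the sizes and the values are illustrative, and the buffer sizing mirrors the qbits == 4 branch of ccv_nnc_palettize above):

```c
#include <assert.h>
#include <ccv.h>
#include <nnc/ccv_nnc.h>

int main(void)
{
	ccv_nnc_init();
	const int n = 1024;
	float* const values = ccmalloc(sizeof(float) * n);
	int i;
	for (i = 0; i < n; i++)
		values[i] = (float)(i % 16); // only 16 distinct values, so a 4-bit palette is lossless
	// Per 512-element block: a 16-entry float palette, then 4 bits per element.
	const size_t bufsize = sizeof(float) * 16 * ((n + 511) / 512) + (n + 1) / 2;
	uint8_t* const compressed = ccmalloc(bufsize);
	const size_t size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, n, 4, 512, compressed, bufsize);
	assert(size <= bufsize);
	float* const decompressed = ccmalloc(sizeof(float) * n);
	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, size, 4, 512, decompressed, n);
	for (i = 0; i < n; i++)
		assert(values[i] == decompressed[i]);
	ccfree(values);
	ccfree(compressed);
	ccfree(decompressed);
	return 0;
}
```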
diff --git a/test/unit/nnc/makefile b/test/unit/nnc/makefile
index 4eb8b2359..982b1d460 100644
--- a/test/unit/nnc/makefile
+++ b/test/unit/nnc/makefile
@@ -3,7 +3,7 @@ include ../../../lib/config.mk
 export LSAN_OPTIONS=suppressions=known-leaks.txt
 LDFLAGS := -L"../../../lib" -lccv $(LDFLAGS)
 CFLAGS := -O3 -Wall -I"../../../lib" -I"../../" $(CFLAGS)
-TARGETS = tfb.tests tensor.tests forward.tests backward.tests gradient.tests graph.tests winograd.tests transform.tests symbolic.graph.tests autograd.tests autograd.vector.tests while.tests tape.tests while.backward.tests case_of.tests case_of.backward.tests numa.tests tensor.bind.tests broadcast.tests reduce.tests batch.norm.tests layer.norm.tests dropout.tests crossentropy.tests dynamic.graph.tests simplify.tests symbolic.graph.compile.tests rand.tests graph.io.tests cnnp.core.tests minimize.tests custom.tests parallel.tests dataframe.tests dataframe.addons.tests compression.tests gemm.tests index.tests swish.tests upsample.tests smooth_l1.tests roi_align.tests nms.tests compare.tests concat.tests cblas.tests micro.tests loss.tests histogram.tests group.norm.tests gelu.tests attention.tests
+TARGETS = tfb.tests tensor.tests forward.tests backward.tests gradient.tests graph.tests winograd.tests transform.tests symbolic.graph.tests autograd.tests autograd.vector.tests while.tests tape.tests while.backward.tests case_of.tests case_of.backward.tests numa.tests tensor.bind.tests broadcast.tests reduce.tests batch.norm.tests layer.norm.tests dropout.tests crossentropy.tests dynamic.graph.tests simplify.tests symbolic.graph.compile.tests rand.tests graph.io.tests cnnp.core.tests minimize.tests custom.tests parallel.tests dataframe.tests dataframe.addons.tests compression.tests gemm.tests index.tests swish.tests upsample.tests smooth_l1.tests roi_align.tests nms.tests compare.tests concat.tests cblas.tests micro.tests loss.tests histogram.tests group.norm.tests gelu.tests attention.tests palettize.tests
 ifeq ($(shell uname 2>/dev/null || echo Unknown),Darwin)
 	LDFLAGS += -Wl,-U,___test_case_setup,-U,___test_case_teardown
diff --git a/test/unit/nnc/palettize.tests.c b/test/unit/nnc/palettize.tests.c
new file mode 100644
index 000000000..495b98188
--- /dev/null
+++ b/test/unit/nnc/palettize.tests.c
@@ -0,0 +1,610 @@
+#include "case.h"
+#include "ccv_case.h"
+#include "ccv_nnc_case.h"
+#include <ccv.h>
+#include <nnc/ccv_nnc.h>
+#include <nnc/ccv_nnc_easy.h>
+#include "3rdparty/dsfmt/dSFMT.h"
+
+TEST_SETUP()
+{
+	ccv_nnc_init();
+}
+
+TEST_CASE("quantize double to 4-bit and dequantize on CPU losslessly")
+{
+	double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944);
+	REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 4-bit and dequantize on CPU losslessly")
+{
+	float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	float* const values = ccmalloc(sizeof(float) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 2);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 4-bit and dequantize on CPU losslessly")
+{
+	float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	uint16_t lut[16];
+	ccv_float_to_half_precision(lut_f32, lut, 16);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 4, 128, compressed, 1420 + 2944 / 4);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
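The expected-size constants in these tests follow directly from the layout ccv_nnc_palettize writes per block: a full 2^qbits-entry palette in the source datatype, then the bit-packed indices. A small helper reproducing that arithmetic (illustrative only; `palettized_size` is not part of the library):

```c
#include <stddef.h>

// Mirrors the return statements of ccv_nnc_palettize above.
static size_t palettized_size(size_t len, size_t element_size, int qbits, size_t number_in_blocks)
{
	const size_t num_blocks = (len + number_in_blocks - 1) / number_in_blocks;
	const size_t palette_bytes = element_size * num_blocks * ((size_t)1 << qbits);
	size_t index_bytes = 0;
	switch (qbits) {
		case 4: index_bytes = (len + 1) / 2; break;     // 2 indices per byte
		case 5: index_bytes = (len + 7) / 8 * 5; break; // 8 indices per 5 bytes
		case 6: index_bytes = (len + 3) / 4 * 3; break; // 4 indices per 3 bytes
		case 7: index_bytes = (len + 7) / 8 * 7; break; // 8 indices per 7 bytes
		case 8: index_bytes = len; break;               // 1 index per byte
	}
	return palette_bytes + index_bytes;
}
// e.g. 2839 doubles, qbits = 4, blocks of 128: ceil(2839 / 128) = 23 blocks,
// 23 * 16 * 8 = 2944 palette bytes plus ceil(2839 / 2) = 1420 index bytes,
// which is the 1420 + 2944 constant used above.
```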
+
+TEST_CASE("quantize double to 5-bit and dequantize on CPU losslessly")
+{
+	double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 8);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 5-bit and dequantize on CPU losslessly")
+{
+	float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	float* const values = ccmalloc(sizeof(float) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 4);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 5-bit and dequantize on CPU losslessly")
+{
+	float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	uint16_t lut[32];
+	ccv_float_to_half_precision(lut_f32, lut, 32);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839);
+	int i;
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 5, 128, compressed, 1775 + 23 * 32 * 2);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2839);
+	for (i = 0; i < 2839; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2839);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839);
+	REQUIRE_ARRAY_EQ(double, values,
output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 6-bit and dequantize on CPU losslessly") +{ + float lut[64]; + int i; + for (i = 0; i < 64; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 64]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 4); + REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 6-bit and dequantize on CPU losslessly") +{ + float lut_f32[64]; + int i; + for (i = 0; i < 64; i++) + lut_f32[i] = (float)i; + uint16_t lut[64]; + ccv_float_to_half_precision(lut_f32, lut, 64); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 64]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 6, 512, compressed, 2130 + 6 * 64 * 2); + REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize double to 7-bit and dequantize on CPU losslessly") +{ + double lut[128]; + int i; + for (i = 0; i < 128; i++) + lut[i] = (double)i; + double* const values = ccmalloc(sizeof(double) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 8)); + const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 8); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match"); + double* const output_values = ccmalloc(sizeof(double) * 2839); + ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 7-bit and dequantize on CPU losslessly") +{ + float lut[128]; + int i; + for (i = 0; i < 128; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 4); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(float, 
values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 7-bit and dequantize on CPU losslessly") +{ + float lut_f32[128]; + int i; + for (i = 0; i < 128; i++) + lut_f32[i] = (float)i; + uint16_t lut[128]; + ccv_float_to_half_precision(lut_f32, lut, 128); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 128]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 7, 512, compressed, 2485 + 6 * 128 * 2); + REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2839); + REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize double to 8-bit and dequantize on CPU losslessly") +{ + double lut[256]; + int i; + for (i = 0; i < 256; i++) + lut[i] = (double)i; + double* const values = ccmalloc(sizeof(double) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 8)); + const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 8); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 8, "output size should match"); + double* const output_values = ccmalloc(sizeof(double) * 2839); + ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839); + REQUIRE_ARRAY_EQ(double, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize float to 8-bit and dequantize on CPU losslessly") +{ + float lut[256]; + int i; + for (i = 0; i < 256; i++) + lut[i] = (float)i; + float* const values = ccmalloc(sizeof(float) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 4)); + const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 4); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 4, "output size should match"); + float* const output_values = ccmalloc(sizeof(float) * 2839); + ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839); + REQUIRE_ARRAY_EQ(float, values, output_values, 2839, "should be lossless"); + ccfree(values); + ccfree(output_values); + ccfree(compressed); +} + +TEST_CASE("quantize half-precision to 8-bit and dequantize on CPU losslessly") +{ + float lut_f32[256]; + int i; + for (i = 0; i < 256; i++) + lut_f32[i] = (float)i; + uint16_t lut[256]; + ccv_float_to_half_precision(lut_f32, lut, 256); + uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2839); + for (i = 0; i < 2839; i++) + values[i] = lut[i % 256]; + uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2839 + 3 * 256 * 2)); + const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2839, 8, 1280, compressed, 2839 + 3 * 256 * 2); + REQUIRE_EQ(output_size, 2839 + 3 * 256 * 2, "output size should match"); + uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2839); + 
ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2839);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2839, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944);
+	REQUIRE_EQ(output_size, 1420 + 2944, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 2);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 2, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 4-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[16] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0};
+	uint16_t lut[16];
+	ccv_float_to_half_precision(lut_f32, lut, 16);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 16];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1420 + 2944 / 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 4, 128, compressed, 1420 + 2944 / 4);
+	REQUIRE_EQ(output_size, 1420 + 2944 / 4, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 4, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
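The 5-bit cases exercise the least obvious packing: eight 5-bit indices straddle exactly five bytes. A self-contained round-trip sketch of that scheme, mirroring the shift/mask pattern in ccv_nnc_tensor_palettize.c (`pack5` and `unpack5` are illustrative names, not library functions):

```c
#include <assert.h>
#include <stdint.h>

// Pack 8 indices in [0, 31] into 5 bytes, high bits first.
static void pack5(const uint8_t idx[8], uint8_t out[5])
{
	out[0] = (idx[0] << 3) | (idx[1] >> 2);
	out[1] = (idx[1] << 6) | (idx[2] << 1) | (idx[3] >> 4);
	out[2] = (idx[3] << 4) | (idx[4] >> 1);
	out[3] = (idx[4] << 7) | (idx[5] << 2) | (idx[6] >> 3);
	out[4] = (idx[6] << 5) | idx[7];
}

// Recover the 8 indices, exactly as the depalettize loops above do.
static void unpack5(const uint8_t in[5], uint8_t idx[8])
{
	idx[0] = in[0] >> 3;
	idx[1] = ((in[0] & 7) << 2) | (in[1] >> 6);
	idx[2] = (in[1] >> 1) & 31;
	idx[3] = ((in[1] & 1) << 4) | (in[2] >> 4);
	idx[4] = ((in[2] & 15) << 1) | (in[3] >> 7);
	idx[5] = (in[3] >> 2) & 31;
	idx[6] = ((in[3] & 3) << 3) | (in[4] >> 5);
	idx[7] = in[4] & 31;
}

int main(void)
{
	const uint8_t idx[8] = {1, 30, 17, 4, 22, 9, 31, 0};
	uint8_t bytes[5], back[8];
	int i;
	pack5(idx, bytes);
	unpack5(bytes, back);
	for (i = 0; i < 8; i++)
		assert(idx[i] == back[i]);
	return 0;
}
```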
+
+TEST_CASE("quantize double to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 8);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 4);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 5-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[32] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0, -14.0, -15.0};
+	uint16_t lut[32];
+	ccv_float_to_half_precision(lut_f32, lut, 32);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	int i;
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 32];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (1775 + 23 * 32 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 5, 128, compressed, 1775 + 23 * 32 * 2);
+	REQUIRE_EQ(output_size, 1775 + 23 * 32 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 5, 128, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 6-bit and
+
+TEST_CASE("quantize double to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 8);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 4);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 6-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[64];
+	int i;
+	for (i = 0; i < 64; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[64];
+	ccv_float_to_half_precision(lut_f32, lut, 64);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 64];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2130 + 6 * 64 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 6, 512, compressed, 2130 + 6 * 64 * 2);
+	REQUIRE_EQ(output_size, 2130 + 6 * 64 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 6, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
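+
+/* The 7-bit cases below also use blocks of 512 elements (6 blocks). Each
+ * block stores a 128-entry palette (6 * 128 * element_size bytes in total)
+ * and indices packed eight elements into seven bytes
+ * (2840 * 7 / 8 = 2485 bytes). */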
+
+TEST_CASE("quantize double to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 8);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 4);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 7-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[128];
+	int i;
+	for (i = 0; i < 128; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[128];
+	ccv_float_to_half_precision(lut_f32, lut, 128);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 128];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2485 + 6 * 128 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 7, 512, compressed, 2485 + 6 * 128 * 2);
+	REQUIRE_EQ(output_size, 2485 + 6 * 128 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 7, 512, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
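+
+/* The 8-bit cases below use blocks of 1280 elements, so 2840 elements span 3
+ * blocks. Each block stores a 256-entry palette (3 * 256 * element_size bytes
+ * in total) and one index byte per element (2840 bytes). */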
+
+TEST_CASE("quantize double to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	double lut[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut[i] = (double)i;
+	double* const values = ccmalloc(sizeof(double) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 8));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_64F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 8);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 8, "output size should match");
+	double* const output_values = ccmalloc(sizeof(double) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_64F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(double, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize float to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut[i] = (float)i;
+	float* const values = ccmalloc(sizeof(float) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 4));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_32F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 4);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 4, "output size should match");
+	float* const output_values = ccmalloc(sizeof(float) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_32F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(float, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+TEST_CASE("quantize half-precision to 8-bit and dequantize on CPU losslessly, fast path")
+{
+	float lut_f32[256];
+	int i;
+	for (i = 0; i < 256; i++)
+		lut_f32[i] = (float)i;
+	uint16_t lut[256];
+	ccv_float_to_half_precision(lut_f32, lut, 256);
+	uint16_t* const values = ccmalloc(sizeof(uint16_t) * 2840);
+	for (i = 0; i < 2840; i++)
+		values[i] = lut[i % 256];
+	uint8_t* compressed = ccmalloc(sizeof(uint8_t) * (2840 + 3 * 256 * 2));
+	const size_t output_size = ccv_nnc_palettize(values, CCV_16F, CCV_TENSOR_CPU_MEMORY, 2840, 8, 1280, compressed, 2840 + 3 * 256 * 2);
+	REQUIRE_EQ(output_size, 2840 + 3 * 256 * 2, "output size should match");
+	uint16_t* const output_values = ccmalloc(sizeof(uint16_t) * 2840);
+	ccv_nnc_depalettize(compressed, CCV_16F, CCV_TENSOR_CPU_MEMORY, output_size, 8, 1280, output_values, 2840);
+	REQUIRE_ARRAY_EQ(uint16_t, values, output_values, 2840, "should be lossless");
+	ccfree(values);
+	ccfree(output_values);
+	ccfree(compressed);
+}
+
+#include "case_main.h"