diff --git a/src/containers/run.c b/src/containers/run.c index 0d6472643..1da4fc140 100644 --- a/src/containers/run.c +++ b/src/containers/run.c @@ -636,24 +636,6 @@ void run_container_andnot(const run_container_t *src_1, } } -ALLOW_UNALIGNED -int run_container_to_uint32_array(void *vout, const run_container_t *cont, - uint32_t base) { - int outpos = 0; - uint32_t *out = (uint32_t *)vout; - for (int i = 0; i < cont->n_runs; ++i) { - uint32_t run_start = base + cont->runs[i].value; - uint16_t le = cont->runs[i].length; - for (int j = 0; j <= le; ++j) { - uint32_t val = run_start + j; - memcpy(out + outpos, &val, - sizeof(uint32_t)); // should be compiled as a MOV on x64 - outpos++; - } - } - return outpos; -} - /* * Print this container using printf (useful for debugging). */ @@ -1026,6 +1008,47 @@ static inline int _avx2_run_container_cardinality(const run_container_t *run) { return sum; } +ALLOW_UNALIGNED +int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont, + uint32_t base) { + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + if (le < 8) { + for (int j = 0; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } else { + int j = 0; + __m256i run_start_v = _mm256_set1_epi32(run_start); + // [8,8,8,8....] + __m256i inc = _mm256_set1_epi32(8); + // used for generate sequence: + // [0, 1, 2, 3...], [8, 9, 10,...] + __m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + for (j = 0; j + 8 <= le; j += 8) { + __m256i val_v = _mm256_add_epi32(run_start_v, delta); + _mm256_storeu_si256((__m256i *)(out + outpos), val_v); + delta = _mm256_add_epi32(inc, delta); + outpos += 8; + } + for (; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } + } + return outpos; +} + CROARING_UNTARGET_AVX2 /* Get the cardinality of `run'. Requires an actual computation. */ @@ -1055,6 +1078,34 @@ int run_container_cardinality(const run_container_t *run) { return _scalar_run_container_cardinality(run); } } + +int _scalar_run_container_to_uint32_array(void *vout, + const run_container_t *cont, + uint32_t base) { + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + for (int j = 0; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } + return outpos; +} + +int run_container_to_uint32_array(void *vout, const run_container_t *cont, + uint32_t base) { + if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) { + return _avx2_run_container_to_uint32_array(vout, cont, base); + } else { + return _scalar_run_container_to_uint32_array(vout, cont, base); + } +} + #else /* Get the cardinality of `run'. Requires an actual computation. */ @@ -1071,6 +1122,25 @@ int run_container_cardinality(const run_container_t *run) { return sum; } + +ALLOW_UNALIGNED +int run_container_to_uint32_array(void *vout, const run_container_t *cont, + uint32_t base) { + int outpos = 0; + uint32_t *out = (uint32_t *)vout; + for (int i = 0; i < cont->n_runs; ++i) { + uint32_t run_start = base + cont->runs[i].value; + uint16_t le = cont->runs[i].length; + for (int j = 0; j <= le; ++j) { + uint32_t val = run_start + j; + memcpy(out + outpos, &val, + sizeof(uint32_t)); // should be compiled as a MOV on x64 + outpos++; + } + } + return outpos; +} + #endif #ifdef __cplusplus