Skip to content

Commit

Permalink
support AVX2 for run_container_to_uint32_array (#642)
Browse files Browse the repository at this point in the history
* support AVX2 for run_container_to_uint32_array

1. support AVX for run_container_to_uint32_array
2. add dense range for run container

baseline
```

 number of values in container = 256
run_container_to_uint32_array(out, Bt, 1234):  3.64 cycles per operation

 number of values in container = 2018
run_container_to_uint32_array(out, Bt, 1234):  3.07 cycles per operation

 number of values in container = 14498
run_container_to_uint32_array(out, Bt, 1234):  3.47 cycles per operation

 number of values in container = 7826
run_container_to_uint32_array(out, Bt, 1234):  0.18 cycles per operation

 number of values in container = 8152
run_container_to_uint32_array(out, Bt, 1234):  0.18 cycles per operation

 number of values in container = 8189
run_container_to_uint32_array(out, Bt, 1234):  0.18 cycles per operation

 number of values in container = 8191
run_container_to_uint32_array(out, Bt, 1234):  0.18 cycles per operation

```

AVX2 version:
```

 number of values in container = 256
run_container_to_uint32_array(out, Bt, 1234):  4.38 cycles per operation

 number of values in container = 2018
run_container_to_uint32_array(out, Bt, 1234):  3.77 cycles per operation

 number of values in container = 14498
run_container_to_uint32_array(out, Bt, 1234):  4.19 cycles per operation

 number of values in container = 7826
run_container_to_uint32_array(out, Bt, 1234):  0.10 cycles per operation

 number of values in container = 8152
run_container_to_uint32_array(out, Bt, 1234):  0.10 cycles per operation

 number of values in container = 8189
run_container_to_uint32_array(out, Bt, 1234):  0.10 cycles per operation

 number of values in container = 8191
run_container_to_uint32_array(out, Bt, 1234):  0.10 cycles per operation

```

SIMD version works well on dense case. However, if the length of each runs is small, a single operation will have an if additional overhead.

* avoid regression when run length is small
  • Loading branch information
stdpain authored Jul 22, 2024
1 parent 065f487 commit e326af3
Showing 1 changed file with 88 additions and 18 deletions.
106 changes: 88 additions & 18 deletions src/containers/run.c
Original file line number Diff line number Diff line change
Expand Up @@ -636,24 +636,6 @@ void run_container_andnot(const run_container_t *src_1,
}
}

ALLOW_UNALIGNED
int run_container_to_uint32_array(void *vout, const run_container_t *cont,
uint32_t base) {
int outpos = 0;
uint32_t *out = (uint32_t *)vout;
for (int i = 0; i < cont->n_runs; ++i) {
uint32_t run_start = base + cont->runs[i].value;
uint16_t le = cont->runs[i].length;
for (int j = 0; j <= le; ++j) {
uint32_t val = run_start + j;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
}
}
return outpos;
}

/*
* Print this container using printf (useful for debugging).
*/
Expand Down Expand Up @@ -1026,6 +1008,47 @@ static inline int _avx2_run_container_cardinality(const run_container_t *run) {
return sum;
}

ALLOW_UNALIGNED
int _avx2_run_container_to_uint32_array(void *vout, const run_container_t *cont,
uint32_t base) {
int outpos = 0;
uint32_t *out = (uint32_t *)vout;

for (int i = 0; i < cont->n_runs; ++i) {
uint32_t run_start = base + cont->runs[i].value;
uint16_t le = cont->runs[i].length;
if (le < 8) {
for (int j = 0; j <= le; ++j) {
uint32_t val = run_start + j;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
}
} else {
int j = 0;
__m256i run_start_v = _mm256_set1_epi32(run_start);
// [8,8,8,8....]
__m256i inc = _mm256_set1_epi32(8);
// used for generate sequence:
// [0, 1, 2, 3...], [8, 9, 10,...]
__m256i delta = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
for (j = 0; j + 8 <= le; j += 8) {
__m256i val_v = _mm256_add_epi32(run_start_v, delta);
_mm256_storeu_si256((__m256i *)(out + outpos), val_v);
delta = _mm256_add_epi32(inc, delta);
outpos += 8;
}
for (; j <= le; ++j) {
uint32_t val = run_start + j;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
}
}
}
return outpos;
}

CROARING_UNTARGET_AVX2

/* Get the cardinality of `run'. Requires an actual computation. */
Expand Down Expand Up @@ -1055,6 +1078,34 @@ int run_container_cardinality(const run_container_t *run) {
return _scalar_run_container_cardinality(run);
}
}

int _scalar_run_container_to_uint32_array(void *vout,
const run_container_t *cont,
uint32_t base) {
int outpos = 0;
uint32_t *out = (uint32_t *)vout;
for (int i = 0; i < cont->n_runs; ++i) {
uint32_t run_start = base + cont->runs[i].value;
uint16_t le = cont->runs[i].length;
for (int j = 0; j <= le; ++j) {
uint32_t val = run_start + j;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
}
}
return outpos;
}

int run_container_to_uint32_array(void *vout, const run_container_t *cont,
uint32_t base) {
if (croaring_hardware_support() & ROARING_SUPPORTS_AVX2) {
return _avx2_run_container_to_uint32_array(vout, cont, base);
} else {
return _scalar_run_container_to_uint32_array(vout, cont, base);
}
}

#else

/* Get the cardinality of `run'. Requires an actual computation. */
Expand All @@ -1071,6 +1122,25 @@ int run_container_cardinality(const run_container_t *run) {

return sum;
}

ALLOW_UNALIGNED
int run_container_to_uint32_array(void *vout, const run_container_t *cont,
uint32_t base) {
int outpos = 0;
uint32_t *out = (uint32_t *)vout;
for (int i = 0; i < cont->n_runs; ++i) {
uint32_t run_start = base + cont->runs[i].value;
uint16_t le = cont->runs[i].length;
for (int j = 0; j <= le; ++j) {
uint32_t val = run_start + j;
memcpy(out + outpos, &val,
sizeof(uint32_t)); // should be compiled as a MOV on x64
outpos++;
}
}
return outpos;
}

#endif

#ifdef __cplusplus
Expand Down

0 comments on commit e326af3

Please sign in to comment.