diff --git a/Makefile b/Makefile index 688dc7a..2bf98ed 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ else # Here we expect x64 # Formally speaking, we only need SSE4, at best, but code checks for AVX # since MSVC only allows to check for AVX and nothing finer like just SSE4 -CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow +CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow -Wno-unused-function endif LDFLAGS = -shared LIBNAME=libstreamvbyte.so.0.0.1 diff --git a/src/streamvbyte.c b/src/streamvbyte.c index 1297da6..ea53705 100644 --- a/src/streamvbyte.c +++ b/src/streamvbyte.c @@ -227,7 +227,7 @@ size_t streamvbyte_encode4(__m128i in, uint8_t *outData, uint8_t *outCode) { } size_t streamvbyte_encode_quad( uint32_t *in, uint8_t *outData, uint8_t *outKey) { - __m128i vin = _mm_loadu_si128((__m128i *) in ); + __m128i vin = _mm_stream_load_si128((__m128i *) in ); return streamvbyte_encode4(vin, outData, outKey); } @@ -272,7 +272,8 @@ static inline __m128i _decode_avx(uint32_t key, } static inline void _write_avx(uint32_t *out, __m128i Vec) { - _mm_storeu_si128((__m128i *)out, Vec); + //_mm_storeu_si128((__m128i *)out, Vec); + _mm_stream_si128((__m128i *)out, Vec); } #endif // __AVX__ diff --git a/tests/decode_perf.c b/tests/decode_perf.c deleted file mode 100644 index 3535c17..0000000 --- a/tests/decode_perf.c +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include -#include -#include -#include - -#include "streamvbyte.h" -//static inline void rdtsc(unsigned long long *destination) { -// uint64_t t; - // __asm__ volatile(".byte 0x0f, 0x31" : "=A"(t)); -// *destination = 0; -//} - -int main() { - int N = 500000; - int NTrials = 200; - uint32_t datain[N]; - uint8_t compressedbuffer[N * 5]; - uint32_t recovdata[N]; - - for (int k = 0; k < N; ++k) - datain[k] = rand() >> (31 & rand()); - - size_t compsize = 0; - - compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding - // uint64_t tsc, tsc2; - // rdtsc(&tsc); - struct rusage before; - getrusage(RUSAGE_SELF, &before); - - size_t compsize2; - for (int i = 0; i < NTrials; i++) - compsize2 = streamvbyte_decode(compressedbuffer, recovdata, - N); // decoding (fast) - struct rusage after; - getrusage(RUSAGE_SELF, &after); - - float t = (after.ru_utime.tv_usec-before.ru_utime.tv_usec)/1000000.0; - printf("time = %f %f uints/sec\n", t, N*NTrials/t); - - // rdtsc(&tsc2); - //tsc2 -= tsc; - //printf("cycles/quadword %llu\n", (4 * tsc2) / (N * 200)); - // here the result is stored in compressedbuffer using compsize bytes - - - printf("compsize=%zu compsize2 = %zu\n", compsize, compsize2); - // assert(compsize == compsize2); - - int k; - for (k = 0; k < N && datain[k] == recovdata[k]; k++) - ; - - if(k < N) - printf("mismatch at %d before=%d after=%d\n", k, datain[k], recovdata[k]); - - assert( k >= N ); - - // free(datain); - //free(compressedbuffer); - //free(recovdata); - printf("Compressed %d integers down to %d bytes.\n", N, (int)compsize); - return 0; -} diff --git a/tests/perf.c b/tests/perf.c index 59ea6d8..9a59ae3 100644 --- a/tests/perf.c +++ b/tests/perf.c @@ -5,15 +5,37 @@ #include #include "streamvbyte.h" -//static inline void rdtsc(unsigned long long *destination) { -// uint64_t t; - // __asm__ volatile(".byte 0x0f, 0x31" : "=A"(t)); -// *destination = 0; -//} + +static inline uint64_t rdtsc(void) { + uint64_t t = 0; +#ifdef __AVX__ + __asm__ volatile(".byte 0x0f, 0x31" : "=A"(t)); +#endif + return t; +} + +typedef struct { + struct rusage usg; + uint64_t tsc; +} timepoint; + +void gettime(timepoint *t) { + t->tsc = rdtsc(); + getrusage(RUSAGE_SELF, &(t->usg)); +} + +void print_duration(timepoint *begin, timepoint *end, int NTotal, char *label) { + float t = (end->usg.ru_utime.tv_sec - begin->usg.ru_utime.tv_sec) + + (end->usg.ru_utime.tv_usec - begin->usg.ru_utime.tv_usec)/1000000.0; + printf("%s time = %.4f %.0f uints/sec\n", label, t, NTotal/t); + + uint64_t cycles = end->tsc - begin->tsc; + printf("%s cycles/quadword %0.2f\n", label, (4.0 * cycles) / NTotal); +} int main() { int N = 500000; - int NTrials = 100; + int NTrials = 200; uint32_t datain[N]; uint8_t compressedbuffer[N * 5]; uint32_t recovdata[N]; @@ -23,26 +45,27 @@ int main() { size_t compsize = 0; - // uint64_t tsc, tsc2; - // rdtsc(&tsc); - struct rusage before; - getrusage(RUSAGE_SELF, &before); + timepoint encode_start, encode_end; + gettime(&encode_start); for (int i = 0; i < NTrials; i++) compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding - struct rusage after; - getrusage(RUSAGE_SELF, &after); + gettime(&encode_end); + print_duration( &encode_start, &encode_end, N*NTrials, "encoding"); + + size_t compsize2; - float t = (after.ru_utime.tv_usec-before.ru_utime.tv_usec)/1000000.0; - printf("time = %f %f uints/sec\n", t, N*NTrials/t); - // rdtsc(&tsc2); - //tsc2 -= tsc; - //printf("cycles/quadword %llu\n", (4 * tsc2) / (N * 200)); - // here the result is stored in compressedbuffer using compsize bytes - size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata, + timepoint decode_start, decode_end; + gettime(&decode_start); + + for (int i = 0; i < NTrials; i++) + compsize2 = streamvbyte_decode(compressedbuffer, recovdata, N); // decoding (fast) + gettime(&decode_end); + print_duration( &decode_start, &decode_end, N*NTrials, "decoding"); + printf("compsize=%zu compsize2 = %zu\n", compsize, compsize2); // assert(compsize == compsize2); @@ -55,9 +78,6 @@ int main() { assert( k >= N ); - // free(datain); - //free(compressedbuffer); - //free(recovdata); printf("Compressed %d integers down to %d bytes.\n", N, (int)compsize); return 0; }