Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sse non-temporal loads/stores #11

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ else
# Here we expect x64
# Strictly speaking, we need SSE4 at most, but the code checks for AVX
# because MSVC only lets us test for AVX and nothing finer-grained such as SSE4 alone
CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow
CFLAGS = -fPIC -march=native -std=c99 -O3 -Wall -Wextra -pedantic -Wshadow -Wno-unused-function
endif
LDFLAGS = -shared
LIBNAME=libstreamvbyte.so.0.0.1
Expand Down
5 changes: 3 additions & 2 deletions src/streamvbyte.c
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ size_t streamvbyte_encode4(__m128i in, uint8_t *outData, uint8_t *outCode) {
}

/*
 * Encode four 32-bit integers starting at `in`.
 *
 * Loads the four values into one 128-bit vector and forwards them, together
 * with `outData` and `outKey`, to streamvbyte_encode4(); its return value is
 * returned unchanged.
 *
 * The non-temporal load (_mm_stream_load_si128 / MOVNTDQA) requires a
 * 16-byte-aligned address and faults otherwise. A uint32_t pointer is only
 * guaranteed 4-byte alignment, so the streaming load is used only when the
 * address happens to be aligned; any other pointer takes the regular
 * unaligned load.
 */
size_t streamvbyte_encode_quad( uint32_t *in, uint8_t *outData, uint8_t *outKey) {
    __m128i vin;
    if (((uintptr_t)in & 15u) == 0) {
        vin = _mm_stream_load_si128((__m128i *) in);
    } else {
        vin = _mm_loadu_si128((__m128i *) in);
    }
    return streamvbyte_encode4(vin, outData, outKey);
}

Expand Down Expand Up @@ -272,7 +272,8 @@ static inline __m128i _decode_avx(uint32_t key,
}

/*
 * Store the four 32-bit lanes of `Vec` to `out`.
 *
 * The streaming store (_mm_stream_si128 / MOVNTDQ) requires a 16-byte-aligned
 * destination and faults otherwise. `out` is a uint32_t pointer, so it is only
 * guaranteed 4-byte alignment: use the non-temporal store when the address is
 * aligned and fall back to the unaligned store when it is not.
 *
 * NOTE(review): non-temporal stores are weakly ordered; if the output buffer
 * is handed to another thread, the caller presumably needs an _mm_sfence()
 * first -- confirm against the callers.
 */
static inline void _write_avx(uint32_t *out, __m128i Vec) {
    if (((uintptr_t)out & 15u) == 0) {
        _mm_stream_si128((__m128i *)out, Vec);
    } else {
        _mm_storeu_si128((__m128i *)out, Vec);
    }
}

#endif // __AVX__
Expand Down
65 changes: 0 additions & 65 deletions tests/decode_perf.c

This file was deleted.

64 changes: 42 additions & 22 deletions tests/perf.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,37 @@
#include <time.h>

#include "streamvbyte.h"
//static inline void rdtsc(unsigned long long *destination) {
// uint64_t t;
// __asm__ volatile(".byte 0x0f, 0x31" : "=A"(t));
// *destination = 0;
//}

/*
 * Read the CPU time-stamp counter.
 *
 * Returns the 64-bit counter value when the file is compiled with __AVX__
 * defined, and 0 otherwise.
 *
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX. The "=A"
 * constraint only denotes the EDX:EAX pair on 32-bit x86; on x86-64 it names
 * a single register, so half of the counter is lost. Reading both halves
 * through "=a"/"=d" and combining them works on both targets.
 */
static inline uint64_t rdtsc(void) {
    uint64_t t = 0;
#ifdef __AVX__
    uint32_t lo, hi;
    __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
    t = ((uint64_t)hi << 32) | lo;
#endif
    return t;
}

/* One measurement sample, as filled in by gettime(). */
typedef struct {
/* resource usage of the process, written by getrusage(RUSAGE_SELF, ...) */
struct rusage usg;
/* time-stamp counter value returned by rdtsc() */
uint64_t tsc;
} timepoint;

/*
 * Fill `t` with a fresh sample: first the time-stamp counter, then the
 * resource usage of the calling process. The getrusage() status is not
 * inspected.
 */
void gettime(timepoint *t) {
    const uint64_t counter = rdtsc();
    struct rusage *usage = &t->usg;

    t->tsc = counter;
    getrusage(RUSAGE_SELF, usage);
}

/*
 * Report what happened between two samples.
 *
 * Prints two lines prefixed with `label`:
 *   - the user CPU time (ru_utime) elapsed between `begin` and `end`, in
 *     seconds, and NTotal divided by that time;
 *   - the time-stamp-counter difference scaled to four items
 *     (4 * cycles / NTotal).
 */
void print_duration(timepoint *begin, timepoint *end, int NTotal, char *label) {
    const struct timeval *from = &begin->usg.ru_utime;
    const struct timeval *to = &end->usg.ru_utime;

    /* whole seconds plus the microsecond remainder, narrowed to float */
    float t = (to->tv_sec - from->tv_sec) +
              (to->tv_usec - from->tv_usec)/1000000.0;
    printf("%s time = %.4f %.0f uints/sec\n", label, t, NTotal/t);

    uint64_t cycles = end->tsc - begin->tsc;
    printf("%s cycles/quadword %0.2f\n", label, (4.0 * cycles) / NTotal);
}

int main() {
int N = 500000;
int NTrials = 100;
int NTrials = 200;
uint32_t datain[N];
uint8_t compressedbuffer[N * 5];
uint32_t recovdata[N];
Expand All @@ -23,26 +45,27 @@ int main() {

size_t compsize = 0;

// uint64_t tsc, tsc2;
// rdtsc(&tsc);
struct rusage before;
getrusage(RUSAGE_SELF, &before);
timepoint encode_start, encode_end;
gettime(&encode_start);

for (int i = 0; i < NTrials; i++)
compsize = streamvbyte_encode(datain, N, compressedbuffer); // encoding

struct rusage after;
getrusage(RUSAGE_SELF, &after);
gettime(&encode_end);
print_duration( &encode_start, &encode_end, N*NTrials, "encoding");

size_t compsize2;

float t = (after.ru_utime.tv_usec-before.ru_utime.tv_usec)/1000000.0;
printf("time = %f %f uints/sec\n", t, N*NTrials/t);
// rdtsc(&tsc2);
//tsc2 -= tsc;
//printf("cycles/quadword %llu\n", (4 * tsc2) / (N * 200));
// here the result is stored in compressedbuffer using compsize bytes
size_t compsize2 = streamvbyte_decode(compressedbuffer, recovdata,
timepoint decode_start, decode_end;
gettime(&decode_start);

for (int i = 0; i < NTrials; i++)
compsize2 = streamvbyte_decode(compressedbuffer, recovdata,
N); // decoding (fast)

gettime(&decode_end);
print_duration( &decode_start, &decode_end, N*NTrials, "decoding");

printf("compsize=%zu compsize2 = %zu\n", compsize, compsize2);
// assert(compsize == compsize2);

Expand All @@ -55,9 +78,6 @@ int main() {

assert( k >= N );

// free(datain);
//free(compressedbuffer);
//free(recovdata);
printf("Compressed %d integers down to %d bytes.\n", N, (int)compsize);
return 0;
}