-
Notifications
You must be signed in to change notification settings - Fork 7
/
zipf.h
182 lines (163 loc) · 5.29 KB
/
zipf.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#pragma once
#include "common.h"
#include "util.h"
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <cstdio>
struct zipf_gen_state {
uint64_t n; // number of items (input)
double theta; // skewness (input) in (0, 1); or, 0 = uniform, 1 = always zero
double alpha; // only depends on theta
double thres; // only depends on theta
uint64_t last_n; // last n used to calculate the following
double dbl_n;
double zetan;
double eta;
// unsigned short rand_state[3]; // prng state
uint64_t rand_state;
};
static double pow_approx(double a, double b) {
// from
// http://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/
// calculate approximation with fraction of the exponent
int e = (int)b;
union {
double d;
int x[2];
} u = {a};
u.x[1] = (int)((b - (double)e) * (double)(u.x[1] - 1072632447) + 1072632447.);
u.x[0] = 0;
// exponentiation by squaring with the exponent's integer part
// double r = u.d makes everything much slower, not sure why
// TODO: use popcount?
double r = 1.;
while (e) {
if (e & 1) r *= a;
a *= a;
e >>= 1;
}
return r * u.d;
}
static void zipf_init(struct zipf_gen_state* state, uint64_t n, double theta,
uint64_t rand_seed) {
assert(n > 0);
if (theta > 0.992 && theta < 1)
fprintf(stderr, "theta > 0.992 will be inaccurate due to approximation\n");
if (theta >= 1. && theta < 40.) {
fprintf(stderr, "theta in [1., 40.) is not supported\n");
assert(false);
}
assert(theta == -1. || (theta >= 0. && theta < 1.) || theta >= 40.);
assert(rand_seed < (1UL << 48));
memset(state, 0, sizeof(struct zipf_gen_state));
state->n = n;
state->theta = theta;
if (theta == -1.)
rand_seed = rand_seed % n;
else if (theta > 0. && theta < 1.) {
state->alpha = 1. / (1. - theta);
state->thres = 1. + pow_approx(0.5, theta);
} else {
state->alpha = 0.; // unused
state->thres = 0.; // unused
}
state->last_n = 0;
state->zetan = 0.;
// state->rand_state[0] = (unsigned short)(rand_seed >> 0);
// state->rand_state[1] = (unsigned short)(rand_seed >> 16);
// state->rand_state[2] = (unsigned short)(rand_seed >> 32);
state->rand_state = rand_seed;
}
static void zipf_init_copy(struct zipf_gen_state* state,
const struct zipf_gen_state* src_state,
uint64_t rand_seed) {
assert(rand_seed < (1UL << 48));
memcpy(state, src_state, sizeof(struct zipf_gen_state));
// state->rand_state[0] = (unsigned short)(rand_seed >> 0);
// state->rand_state[1] = (unsigned short)(rand_seed >> 16);
// state->rand_state[2] = (unsigned short)(rand_seed >> 32);
state->rand_state = rand_seed;
}
static void zipf_change_n(struct zipf_gen_state* state, uint64_t n) {
state->n = n;
}
static double zeta(uint64_t last_n, double last_sum, uint64_t n, double theta) {
if (last_n > n) {
last_n = 0;
last_sum = 0.;
}
while (last_n < n) {
last_sum += 1. / pow_approx((double)last_n + 1., theta);
last_n++;
}
return last_sum;
}
static uint64_t zipf_next(struct zipf_gen_state* state) {
if (state->last_n != state->n) {
if (state->theta > 0. && state->theta < 1.) {
state->zetan = zeta(state->last_n, state->zetan, state->n, state->theta);
state->eta = (1. - pow_approx(2. / (double)state->n, 1. - state->theta)) /
(1. - zeta(0, 0., 2, state->theta) / state->zetan);
}
state->last_n = state->n;
state->dbl_n = (double)state->n;
}
if (state->theta == -1.) {
uint64_t v = state->rand_state;
if (++state->rand_state >= state->n) state->rand_state = 0;
return v;
} else if (state->theta == 0.) {
double u = fast_rand_d(&state->rand_state);
return (uint64_t)(state->dbl_n * u);
} else if (state->theta >= 40.) {
return 0UL;
} else {
// from J. Gray et al. Quickly generating billion-record synthetic
// databases. In SIGMOD, 1994.
// double u = erand48(state->rand_state);
double u = fast_rand_d(&state->rand_state);
double uz = u * state->zetan;
if (uz < 1.)
return 0UL;
else if (uz < state->thres)
return 1UL;
else
return (uint64_t)(state->dbl_n *
pow_approx(state->eta * (u - 1.) + 1., state->alpha));
}
}
static double zipf_prob(const struct zipf_gen_state* state, uint64_t i) {
// this must be called after at least one zipf_next() invocation
if (state->theta == -1.)
return 1.;
else if (state->theta == 0.)
return 1.;
else if (state->theta >= 40.) {
if (i == 0)
return 1.;
else
return 0.;
} else {
return 1. / pow_approx((double)i + 1., state->theta);
}
}
static void test_zipf(double theta) {
double zetan = 0.;
const uint64_t n = 1000000UL;
uint64_t i;
for (i = 0; i < n; i++) zetan += 1. / pow((double)i + 1., theta);
struct zipf_gen_state state;
if (theta < 1. || theta >= 40.) zipf_init(&state, n, theta, 0);
uint64_t num_key0 = 0;
const uint64_t num_samples = 10000000UL;
if (theta < 1. || theta >= 40.) {
for (i = 0; i < num_samples; i++)
if (zipf_next(&state) == 0) num_key0++;
}
printf("theta = %lf; using pow(): %.10lf", theta, 1. / zetan);
if (theta < 1. || theta >= 40.)
printf(", using approx-pow(): %.10lf",
(double)num_key0 / (double)num_samples);
printf("\n");
}