-
Notifications
You must be signed in to change notification settings - Fork 456
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
implement rolling hyper-log-log algorithm (#8068)
## Problem See #7466 ## Summary of changes Implement the algorithm described in https://hal.science/hal-00465313/document A new GUC is added: `neon.wss_max_duration`, which specifies the size of the sliding window (in seconds). The default value is 1 hour. It is possible to request an estimation of the working set size within this window using the new function `approximate_working_set_size_seconds`. The old function `approximate_working_set_size` is preserved for backward compatibility, but its scope is also limited by `neon.wss_max_duration`. The version of the Neon extension is changed to 1.4 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik <[email protected]> Co-authored-by: Matthias van de Meent <[email protected]>
- Loading branch information
Showing
8 changed files
with
363 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
/*------------------------------------------------------------------------- | ||
* | ||
* hll.c | ||
* Sliding HyperLogLog cardinality estimator | ||
* | ||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group | ||
* | ||
* Implements https://hal.science/hal-00465313/document | ||
* | ||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally | ||
* suited to estimating the cardinality of very large sets; in particular, we | ||
* have not attempted to further optimize the implementation as described in | ||
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic | ||
* Engineering of a State of The Art Cardinality Estimation Algorithm". | ||
* | ||
* A sparse representation of HyperLogLog state is used, with fixed space | ||
* overhead. | ||
* | ||
* The copyright terms of Ohno's original version (the MIT license) follow. | ||
* | ||
* IDENTIFICATION | ||
* src/backend/lib/hyperloglog.c | ||
* | ||
*------------------------------------------------------------------------- | ||
*/ | ||
|
||
/* | ||
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the 'Software'), to | ||
* deal in the Software without restriction, including without limitation the | ||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
* sell copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
* IN THE SOFTWARE. | ||
*/ | ||
|
||
#include <math.h> | ||
|
||
#include "postgres.h" | ||
#include "funcapi.h" | ||
#include "port/pg_bitutils.h" | ||
#include "utils/timestamp.h" | ||
#include "hll.h" | ||
|
||
|
||
/* 2^32 as a double, and its negation; used by the large range correction. */
#define POW_2_32		(4294967296.0)
#define NEG_POW_2_32	(-POW_2_32)

/*
 * Bias-corrected normalisation constant of the raw estimate:
 * alpha_m * m^2, with m = HLL_N_REGISTERS.
 */
#define ALPHA_MM \
	((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS)
|
||
/* | ||
* Worker for addHyperLogLog(). | ||
* | ||
* Calculates the position of the first set bit in first b bits of x argument | ||
* starting from the first, reading from most significant to least significant | ||
* bits. | ||
* | ||
* Example (when considering fist 10 bits of x): | ||
* | ||
* rho(x = 0b1000000000) returns 1 | ||
* rho(x = 0b0010000000) returns 3 | ||
* rho(x = 0b0000000000) returns b + 1 | ||
* | ||
* "The binary address determined by the first b bits of x" | ||
* | ||
* Return value "j" used to index bit pattern to watch. | ||
*/ | ||
static inline uint8 | ||
rho(uint32 x, uint8 b) | ||
{ | ||
uint8 j = 1; | ||
|
||
if (x == 0) | ||
return b + 1; | ||
|
||
j = 32 - pg_leftmost_one_pos32(x); | ||
|
||
if (j > b) | ||
return b + 1; | ||
|
||
return j; | ||
} | ||
|
||
/* | ||
* Initialize HyperLogLog track state | ||
*/ | ||
void | ||
initSHLL(HyperLogLogState *cState) | ||
{ | ||
memset(cState->regs, 0, sizeof(cState->regs)); | ||
} | ||
|
||
/* | ||
* Adds element to the estimator, from caller-supplied hash. | ||
* | ||
* It is critical that the hash value passed be an actual hash value, typically | ||
* generated using hash_any(). The algorithm relies on a specific bit-pattern | ||
* observable in conjunction with stochastic averaging. There must be a | ||
* uniform distribution of bits in hash values for each distinct original value | ||
* observed. | ||
*/ | ||
void | ||
addSHLL(HyperLogLogState *cState, uint32 hash) | ||
{ | ||
uint8 count; | ||
uint32 index; | ||
size_t i; | ||
size_t j; | ||
|
||
TimestampTz now = GetCurrentTimestamp(); | ||
/* Use the first "k" (registerWidth) bits as a zero based index */ | ||
index = hash >> HLL_C_BITS; | ||
|
||
/* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ | ||
count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); | ||
|
||
cState->regs[index][count] = now; | ||
} | ||
|
||
static uint8 | ||
getMaximum(const TimestampTz* reg, TimestampTz since) | ||
{ | ||
uint8 max = 0; | ||
|
||
for (size_t i = 0; i < HLL_C_BITS + 1; i++) | ||
{ | ||
if (reg[i] >= since) | ||
{ | ||
max = i; | ||
} | ||
} | ||
|
||
return max; | ||
} | ||
|
||
|
||
/* | ||
* Estimates cardinality, based on elements added so far | ||
*/ | ||
double | ||
estimateSHLL(HyperLogLogState *cState, time_t duration) | ||
{ | ||
double result; | ||
double sum = 0.0; | ||
size_t i; | ||
uint8 R[HLL_N_REGISTERS]; | ||
/* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ | ||
TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; | ||
|
||
for (i = 0; i < HLL_N_REGISTERS; i++) | ||
{ | ||
R[i] = getMaximum(cState->regs[i], since); | ||
sum += 1.0 / pow(2.0, R[i]); | ||
} | ||
|
||
/* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ | ||
result = ALPHA_MM / sum; | ||
|
||
if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) | ||
{ | ||
/* Small range correction */ | ||
int zero_count = 0; | ||
|
||
for (i = 0; i < HLL_N_REGISTERS; i++) | ||
{ | ||
zero_count += R[i] == 0; | ||
} | ||
|
||
if (zero_count != 0) | ||
result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / | ||
zero_count); | ||
} | ||
else if (result > (1.0 / 30.0) * POW_2_32) | ||
{ | ||
/* Large range correction */ | ||
result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); | ||
} | ||
|
||
return result; | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
/*------------------------------------------------------------------------- | ||
* | ||
* hll.h | ||
* Sliding HyperLogLog cardinality estimator | ||
* | ||
* Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group | ||
* | ||
* Implements https://hal.science/hal-00465313/document | ||
* | ||
* Based on Hideaki Ohno's C++ implementation. This is probably not ideally | ||
* suited to estimating the cardinality of very large sets; in particular, we | ||
* have not attempted to further optimize the implementation as described in | ||
* the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic | ||
* Engineering of a State of The Art Cardinality Estimation Algorithm". | ||
* | ||
* A sparse representation of HyperLogLog state is used, with fixed space | ||
* overhead. | ||
* | ||
* The copyright terms of Ohno's original version (the MIT license) follow. | ||
* | ||
* IDENTIFICATION | ||
* src/backend/lib/hyperloglog.c | ||
* | ||
*------------------------------------------------------------------------- | ||
*/ | ||
|
||
/* | ||
* Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com> | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the 'Software'), to | ||
* deal in the Software without restriction, including without limitation the | ||
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | ||
* sell copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | ||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | ||
* IN THE SOFTWARE. | ||
*/ | ||
|
||
#ifndef HLL_H | ||
#define HLL_H | ||
|
||
/* Number of leading hash bits used to pick a register. */
#define HLL_BIT_WIDTH		10
/* Number of remaining hash bits, whose rank is tracked per register. */
#define HLL_C_BITS			(32 - HLL_BIT_WIDTH)
/* Total number of registers: 2^HLL_BIT_WIDTH. */
#define HLL_N_REGISTERS		(1 << HLL_BIT_WIDTH)
|
||
/* | ||
* HyperLogLog is an approximate technique for computing the number of distinct | ||
* entries in a set. Importantly, it does this by using a fixed amount of | ||
* memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal | ||
* cardinality estimation algorithm" for more. | ||
* | ||
* Instead of a single counter for every bits register, we have a timestamp | ||
* for every valid number of bits we can encounter. Every time we encounter | ||
* a certain number of bits, we update the timestamp in those registers to | ||
* the current timestamp. | ||
* | ||
* We can query the sketch's stored cardinality for the range of some timestamp | ||
* up to now: For each register, we return the highest bits bucket that has a | ||
* modified timestamp >= the query timestamp. This value is the number of bits | ||
* for this register in the normal HLL calculation. | ||
* | ||
* The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. | ||
* Usage could be halved if we decide to reduce the required time dimension | ||
* precision; as 32 bits in second precision should be enough for statistics. | ||
* However, that is not yet implemented. | ||
*/ | ||
typedef struct HyperLogLogState | ||
{ | ||
TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; | ||
} HyperLogLogState; | ||
|
||
extern void initSHLL(HyperLogLogState *cState); | ||
extern void addSHLL(HyperLogLogState *cState, uint32 hash); | ||
extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); | ||
|
||
#endif |
Oops, something went wrong.