forked from shenango/silodb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
base_txn_btree.h
495 lines (443 loc) · 17.3 KB
/
base_txn_btree.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
#ifndef _NDB_BASE_TXN_BTREE_H_
#define _NDB_BASE_TXN_BTREE_H_
#include "btree_choice.h"
#include "txn.h"
#include "lockguard.h"
#include "util.h"
#include "ndb_type_traits.h"
#include <string>
#include <map>
#include <type_traits>
#include <memory>
// each Transaction implementation should specialize this for special
// behavior- the default implementation is just nops
template <template <typename> class Transaction>
struct base_txn_btree_handler {
static inline void on_construct() {} // called when initializing
static const bool has_background_task = false;
};
template <template <typename> class Transaction, typename P>
class base_txn_btree {
public:
typedef transaction_base::tid_t tid_t;
typedef transaction_base::size_type size_type;
typedef transaction_base::string_type string_type;
typedef concurrent_btree::string_type keystring_type;
base_txn_btree(size_type value_size_hint = 128,
bool mostly_append = false,
const std::string &name = "<unknown>")
: value_size_hint(value_size_hint),
name(name),
been_destructed(false)
{
base_txn_btree_handler<Transaction>::on_construct();
}
~base_txn_btree()
{
if (!been_destructed)
unsafe_purge(false);
}
inline size_t
size_estimate() const
{
return underlying_btree.size();
}
inline size_type
get_value_size_hint() const
{
return value_size_hint;
}
inline void
set_value_size_hint(size_type value_size_hint)
{
this->value_size_hint = value_size_hint;
}
inline void print() {
underlying_btree.print();
}
/**
* only call when you are sure there are no concurrent modifications on the
* tree. is neither threadsafe nor transactional
*
* Note that when you call unsafe_purge(), this txn_btree becomes
* completely invalidated and un-usable. Any further operations
* (other than calling the destructor) are undefined
*/
std::map<std::string, uint64_t> unsafe_purge(bool dump_stats = false);
private:
struct purge_tree_walker : public concurrent_btree::tree_walk_callback {
virtual void on_node_begin(const typename concurrent_btree::node_opaque_t *n);
virtual void on_node_success();
virtual void on_node_failure();
#ifdef TXN_BTREE_DUMP_PURGE_STATS
purge_tree_walker()
: purge_stats_nodes(0),
purge_stats_nosuffix_nodes(0) {}
std::map<size_t, size_t> purge_stats_tuple_record_size_counts; // just the record
std::map<size_t, size_t> purge_stats_tuple_alloc_size_counts; // includes overhead
//std::map<size_t, size_t> purge_stats_tuple_chain_counts;
std::vector<uint16_t> purge_stats_nkeys_node;
size_t purge_stats_nodes;
size_t purge_stats_nosuffix_nodes;
std::map<std::string, uint64_t>
dump_stats()
{
std::map<std::string, uint64_t> ret;
size_t v = 0;
for (std::vector<uint16_t>::iterator it = purge_stats_nkeys_node.begin();
it != purge_stats_nkeys_node.end(); ++it)
v += *it;
const double avg_nkeys_node = double(v)/double(purge_stats_nkeys_node.size());
const double avg_fill_factor = avg_nkeys_node/double(concurrent_btree::NKeysPerNode);
std::cerr << "btree node stats" << std::endl;
std::cerr << " avg_nkeys_node: " << avg_nkeys_node << std::endl;
std::cerr << " avg_fill_factor: " << avg_fill_factor << std::endl;
std::cerr << " num_nodes: " << purge_stats_nodes << std::endl;
std::cerr << " num_nosuffix_nodes: " << purge_stats_nosuffix_nodes << std::endl;
std::cerr << "record size stats (nbytes => count)" << std::endl;
for (std::map<size_t, size_t>::iterator it = purge_stats_tuple_record_size_counts.begin();
it != purge_stats_tuple_record_size_counts.end(); ++it)
std::cerr << " " << it->first << " => " << it->second << std::endl;
std::cerr << "alloc size stats (nbytes => count)" << std::endl;
for (std::map<size_t, size_t>::iterator it = purge_stats_tuple_alloc_size_counts.begin();
it != purge_stats_tuple_alloc_size_counts.end(); ++it)
std::cerr << " " << (it->first + sizeof(dbtuple)) << " => " << it->second << std::endl;
//std::cerr << "chain stats (length => count)" << std::endl;
//for (std::map<size_t, size_t>::iterator it = purge_stats_tuple_chain_counts.begin();
// it != purge_stats_tuple_chain_counts.end(); ++it) {
// std::cerr << " " << it->first << " => " << it->second << std::endl;
// ret["chain_" + std::to_string(it->first)] += it->second;
//}
//std::cerr << "deleted recored stats" << std::endl;
//std::cerr << " logically_removed (total): " << (purge_stats_tuple_logically_removed_no_mark + purge_stats_tuple_logically_removed_with_mark) << std::endl;
//std::cerr << " logically_removed_no_mark: " << purge_stats_tuple_logically_removed_no_mark << std::endl;
//std::cerr << " logically_removed_with_mark: " << purge_stats_tuple_logically_removed_with_mark << std::endl;
return ret;
}
#endif
private:
std::vector< std::pair<typename concurrent_btree::value_type, bool> > spec_values;
};
protected:
// readers are placed here so they can be shared amongst
// derived implementations
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
struct txn_search_range_callback : public concurrent_btree::low_level_search_range_callback {
constexpr txn_search_range_callback(
Transaction<Traits> *t,
Callback *caller_callback,
KeyReader *key_reader,
ValueReader *value_reader)
: t(t), caller_callback(caller_callback),
key_reader(key_reader), value_reader(value_reader) {}
virtual void on_resp_node(const typename concurrent_btree::node_opaque_t *n, uint64_t version);
virtual bool invoke(const typename concurrent_btree::string_type &k, typename concurrent_btree::value_type v,
const typename concurrent_btree::node_opaque_t *n, uint64_t version);
private:
Transaction<Traits> *const t;
Callback *const caller_callback;
KeyReader *const key_reader;
ValueReader *const value_reader;
};
template <typename Traits, typename ValueReader>
inline bool
do_search(Transaction<Traits> &t,
const typename P::Key &k,
ValueReader &value_reader);
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
inline void
do_search_range_call(Transaction<Traits> &t,
const typename P::Key &lower,
const typename P::Key *upper,
Callback &callback,
KeyReader &key_reader,
ValueReader &value_reader);
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
inline void
do_rsearch_range_call(Transaction<Traits> &t,
const typename P::Key &upper,
const typename P::Key *lower,
Callback &callback,
KeyReader &key_reader,
ValueReader &value_reader);
// expect_new indicates if we expect the record to not exist in the tree-
// is just a hint that affects perf, not correctness. remove is put with nullptr
// as value.
//
// NOTE: both key and value are expected to be stable values already
template <typename Traits>
void do_tree_put(Transaction<Traits> &t,
const std::string *k,
const typename P::Value *v,
dbtuple::tuple_writer_t writer,
bool expect_new);
concurrent_btree underlying_btree;
size_type value_size_hint;
std::string name;
bool been_destructed;
};
namespace private_ {
STATIC_COUNTER_DECL(scopedperf::tsc_ctr, txn_btree_search_probe0, txn_btree_search_probe0_cg)
STATIC_COUNTER_DECL(scopedperf::tsc_ctr, txn_btree_search_probe1, txn_btree_search_probe1_cg)
}
template <template <typename> class Transaction, typename P>
template <typename Traits, typename ValueReader>
bool
base_txn_btree<Transaction, P>::do_search(
Transaction<Traits> &t,
const typename P::Key &k,
ValueReader &value_reader)
{
t.ensure_active();
typename P::KeyWriter key_writer(&k);
const std::string * const key_str =
key_writer.fully_materialize(true, t.string_allocator());
// search the underlying btree to map k=>(btree_node|tuple)
typename concurrent_btree::value_type underlying_v{};
concurrent_btree::versioned_node_t search_info;
const bool found = this->underlying_btree.search(varkey(*key_str), underlying_v, &search_info);
if (found) {
const dbtuple * const tuple = reinterpret_cast<const dbtuple *>(underlying_v);
return t.do_tuple_read(tuple, value_reader);
} else {
// not found, add to absent_set
t.do_node_read(search_info.first, search_info.second);
return false;
}
}
template <template <typename> class Transaction, typename P>
std::map<std::string, uint64_t>
base_txn_btree<Transaction, P>::unsafe_purge(bool dump_stats)
{
ALWAYS_ASSERT(!been_destructed);
been_destructed = true;
purge_tree_walker w;
scoped_rcu_region guard;
underlying_btree.tree_walk(w);
underlying_btree.clear();
#ifdef TXN_BTREE_DUMP_PURGE_STATS
if (!dump_stats)
return std::map<std::string, uint64_t>();
return w.dump_stats();
#else
return std::map<std::string, uint64_t>();
#endif
}
template <template <typename> class Transaction, typename P>
void
base_txn_btree<Transaction, P>::purge_tree_walker::on_node_begin(const typename concurrent_btree::node_opaque_t *n)
{
INVARIANT(spec_values.empty());
spec_values = concurrent_btree::ExtractValues(n);
}
template <template <typename> class Transaction, typename P>
void
base_txn_btree<Transaction, P>::purge_tree_walker::on_node_success()
{
for (size_t i = 0; i < spec_values.size(); i++) {
dbtuple *tuple = (dbtuple *) spec_values[i].first;
INVARIANT(tuple);
#ifdef TXN_BTREE_DUMP_PURGE_STATS
// XXX(stephentu): should we also walk the chain?
purge_stats_tuple_record_size_counts[tuple->is_deleting() ? 0 : tuple->size]++;
purge_stats_tuple_alloc_size_counts[tuple->alloc_size]++;
//purge_stats_tuple_chain_counts[tuple->chain_length()]++;
#endif
if (base_txn_btree_handler<Transaction>::has_background_task) {
#ifdef CHECK_INVARIANTS
lock_guard<dbtuple> l(tuple, false);
#endif
if (!tuple->is_deleting()) {
INVARIANT(tuple->is_latest());
tuple->clear_latest();
tuple->mark_deleting();
dbtuple::release(tuple);
} else {
// enqueued already to background gc by the writer of the delete
}
} else {
// XXX: this path is probably not right
dbtuple::release_no_rcu(tuple);
}
}
#ifdef TXN_BTREE_DUMP_PURGE_STATS
purge_stats_nkeys_node.push_back(spec_values.size());
purge_stats_nodes++;
for (size_t i = 0; i < spec_values.size(); i++)
if (spec_values[i].second)
goto done;
purge_stats_nosuffix_nodes++;
done:
#endif
spec_values.clear();
}
template <template <typename> class Transaction, typename P>
void
base_txn_btree<Transaction, P>::purge_tree_walker::on_node_failure()
{
spec_values.clear();
}
template <template <typename> class Transaction, typename P>
template <typename Traits>
void base_txn_btree<Transaction, P>::do_tree_put(
Transaction<Traits> &t,
const std::string *k,
const typename P::Value *v,
dbtuple::tuple_writer_t writer,
bool expect_new)
{
INVARIANT(k);
INVARIANT(!expect_new || v); // makes little sense to remove() a key you expect
// to not be present, so we assert this doesn't happen
// for now [since this would indicate a suboptimality]
t.ensure_active();
if (unlikely(t.is_snapshot())) {
const transaction_base::abort_reason r = transaction_base::ABORT_REASON_USER;
t.abort_impl(r);
throw transaction_abort_exception(r);
}
dbtuple *px = nullptr;
bool insert = false;
retry:
if (expect_new) {
auto ret = t.try_insert_new_tuple(this->underlying_btree, k, v, writer);
INVARIANT(!ret.second || ret.first);
if (unlikely(ret.second)) {
const transaction_base::abort_reason r = transaction_base::ABORT_REASON_WRITE_NODE_INTERFERENCE;
t.abort_impl(r);
throw transaction_abort_exception(r);
}
px = ret.first;
if (px)
insert = true;
}
if (!px) {
// do regular search
typename concurrent_btree::value_type bv = 0;
if (!this->underlying_btree.search(varkey(*k), bv)) {
// XXX(stephentu): if we are removing a key and we can't find it, then we
// should just treat this as a read [of an empty-value], instead of
// explicitly inserting an empty node...
expect_new = true;
goto retry;
}
px = reinterpret_cast<dbtuple *>(bv);
}
INVARIANT(px);
if (!insert) {
// add to write set normally, as non-insert
t.write_set.emplace_back(px, k, v, writer, &this->underlying_btree, false);
} else {
// should already exist in write set as insert
// (because of try_insert_new_tuple())
// too expensive to be a practical check
//INVARIANT(t.find_write_set(px) != t.write_set.end());
//INVARIANT(t.find_write_set(px)->is_insert());
}
}
template <template <typename> class Transaction, typename P>
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
void
base_txn_btree<Transaction, P>
::txn_search_range_callback<Traits, Callback, KeyReader, ValueReader>
::on_resp_node(
const typename concurrent_btree::node_opaque_t *n, uint64_t version)
{
VERBOSE(std::cerr << "on_resp_node(): <node=0x" << util::hexify(intptr_t(n))
<< ", version=" << version << ">" << std::endl);
VERBOSE(std::cerr << " " << concurrent_btree::NodeStringify(n) << std::endl);
t->do_node_read(n, version);
}
template <template <typename> class Transaction, typename P>
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
bool
base_txn_btree<Transaction, P>
::txn_search_range_callback<Traits, Callback, KeyReader, ValueReader>
::invoke(
const typename concurrent_btree::string_type &k, typename concurrent_btree::value_type v,
const typename concurrent_btree::node_opaque_t *n, uint64_t version)
{
t->ensure_active();
VERBOSE(std::cerr << "search range k: " << util::hexify(k) << " from <node=0x" << util::hexify(n)
<< ", version=" << version << ">" << std::endl
<< " " << *((dbtuple *) v) << std::endl);
const dbtuple * const tuple = reinterpret_cast<const dbtuple *>(v);
if (t->do_tuple_read(tuple, *value_reader))
return caller_callback->invoke(
(*key_reader)(k), value_reader->results());
return true;
}
template <template <typename> class Transaction, typename P>
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
void
base_txn_btree<Transaction, P>::do_search_range_call(
Transaction<Traits> &t,
const typename P::Key &lower,
const typename P::Key *upper,
Callback &callback,
KeyReader &key_reader,
ValueReader &value_reader)
{
t.ensure_active();
if (upper)
VERBOSE(std::cerr << "txn_btree(0x" << util::hexify(intptr_t(this))
<< ")::search_range_call [" << util::hexify(lower)
<< ", " << util::hexify(*upper) << ")" << std::endl);
else
VERBOSE(std::cerr << "txn_btree(0x" << util::hexify(intptr_t(this))
<< ")::search_range_call [" << util::hexify(lower)
<< ", +inf)" << std::endl);
typename P::KeyWriter lower_key_writer(&lower);
const std::string * const lower_str =
lower_key_writer.fully_materialize(true, t.string_allocator());
typename P::KeyWriter upper_key_writer(upper);
const std::string * const upper_str =
upper_key_writer.fully_materialize(true, t.string_allocator());
if (unlikely(upper_str && *upper_str <= *lower_str))
return;
txn_search_range_callback<Traits, Callback, KeyReader, ValueReader> c(
&t, &callback, &key_reader, &value_reader);
varkey uppervk;
if (upper_str)
uppervk = varkey(*upper_str);
this->underlying_btree.search_range_call(
varkey(*lower_str), upper_str ? &uppervk : nullptr,
c, t.string_allocator()());
}
template <template <typename> class Transaction, typename P>
template <typename Traits, typename Callback,
typename KeyReader, typename ValueReader>
void
base_txn_btree<Transaction, P>::do_rsearch_range_call(
Transaction<Traits> &t,
const typename P::Key &upper,
const typename P::Key *lower,
Callback &callback,
KeyReader &key_reader,
ValueReader &value_reader)
{
t.ensure_active();
typename P::KeyWriter lower_key_writer(lower);
const std::string * const lower_str =
lower_key_writer.fully_materialize(true, t.string_allocator());
typename P::KeyWriter upper_key_writer(&upper);
const std::string * const upper_str =
upper_key_writer.fully_materialize(true, t.string_allocator());
if (unlikely(lower_str && *upper_str <= *lower_str))
return;
txn_search_range_callback<Traits, Callback, KeyReader, ValueReader> c(
&t, &callback, &key_reader, &value_reader);
varkey lowervk;
if (lower_str)
lowervk = varkey(*lower_str);
this->underlying_btree.rsearch_range_call(
varkey(*upper_str), lower_str ? &lowervk : nullptr,
c, t.string_allocator()());
}
#endif /* _NDB_BASE_TXN_BTREE_H_ */