-
Notifications
You must be signed in to change notification settings - Fork 458
/
range_keys.go
737 lines (688 loc) · 29.3 KB
/
range_keys.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
// Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package pebble
import (
"context"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/treeprinter"
"github.com/cockroachdb/pebble/sstable"
)
// constructRangeKeyIter constructs the range-key iterator stack, populating
// i.rangeKey.rangeKeyIter with the resulting iterator.
func (i *Iterator) constructRangeKeyIter() {
i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
&i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
&i.hasPrefix, &i.prefixOrFullSeekKey, false /* internalKeys */, &i.rangeKey.rangeKeyBuffers.internal)
if i.opts.DebugRangeKeyStack {
// The default logger is preferable to i.opts.getLogger(), at least in the
// metamorphic test.
i.rangeKey.rangeKeyIter = keyspan.InjectLogging(i.rangeKey.rangeKeyIter, base.DefaultLogger)
}
// If there's an indexed batch with range keys, include it.
if i.batch != nil {
if i.batch.index == nil {
// This isn't an indexed batch. We shouldn't have gotten this far.
panic(errors.AssertionFailedf("creating an iterator over an unindexed batch"))
} else {
// Only include the batch's range key iterator if it has any keys.
// NB: This can force reconstruction of the rangekey iterator stack
// in SetOptions if subsequently range keys are added. See
// SetOptions.
if i.batch.countRangeKeys > 0 {
i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, i.batchSeqNum)
i.rangeKey.iterConfig.AddLevel(&i.batchRangeKeyIter)
}
}
}
if !i.batchOnlyIter {
// Next are the flushables: memtables and large batches.
if i.readState != nil {
for j := len(i.readState.memtables) - 1; j >= 0; j-- {
mem := i.readState.memtables[j]
// We only need to read from memtables which contain sequence numbers older
// than seqNum.
if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
continue
}
if rki := mem.newRangeKeyIter(&i.opts); rki != nil {
i.rangeKey.iterConfig.AddLevel(rki)
}
}
}
current := i.version
if current == nil {
current = i.readState.current
}
// Next are the file levels: L0 sub-levels followed by lower levels.
// Add file-specific iterators for L0 files containing range keys. We
// maintain a separate manifest.LevelMetadata for each level containing only
// files that contain range keys, however we don't compute a separate
// L0Sublevels data structure too.
//
// We first use L0's LevelMetadata to peek and see whether L0 contains any
// range keys at all. If it does, we create a range key level iterator per
// level that contains range keys using the information from L0Sublevels.
// Some sublevels may not contain any range keys, and we need to iterate
// through the fileMetadata to determine that. Since L0's file count should
// not significantly exceed ~1000 files (see L0CompactionFileThreshold),
// this should be okay.
if !current.RangeKeyLevels[0].Empty() {
// L0 contains at least 1 file containing range keys.
// Add level iterators for the L0 sublevels, iterating from newest to
// oldest.
for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- {
iter := current.L0SublevelFiles[j].Iter()
if !containsAnyRangeKeys(iter) {
continue
}
li := i.rangeKey.iterConfig.NewLevelIter()
li.Init(
i.ctx,
i.opts.SpanIterOptions(),
i.cmp,
i.newIterRangeKey,
iter.Filter(manifest.KeyTypeRange),
manifest.L0Sublevel(j),
manifest.KeyTypeRange,
)
i.rangeKey.iterConfig.AddLevel(li)
}
}
// Add level iterators for the non-empty non-L0 levels.
for level := 1; level < len(current.RangeKeyLevels); level++ {
if current.RangeKeyLevels[level].Empty() {
continue
}
li := i.rangeKey.iterConfig.NewLevelIter()
spanIterOpts := i.opts.SpanIterOptions()
li.Init(i.ctx, spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
manifest.Level(level), manifest.KeyTypeRange)
i.rangeKey.iterConfig.AddLevel(li)
}
}
}
func containsAnyRangeKeys(iter manifest.LevelIterator) bool {
for f := iter.First(); f != nil; f = iter.Next() {
if f.HasRangeKeys {
return true
}
}
return false
}
// Range key masking
//
// Pebble iterators may be configured such that range keys with suffixes mask
// point keys with lower suffixes. The intended use is implementing a MVCC
// delete range operation using range keys, when suffixes are MVCC timestamps.
//
// To enable masking, the user populates the IterOptions's RangeKeyMasking
// field. The Suffix field configures which range keys act as masks. The
// intended use is to hold a MVCC read timestamp. When implementing a MVCC
// delete range operation, only range keys that are visible at the read
// timestamp should be visible. If a range key has a suffix ≤
// RangeKeyMasking.Suffix, it acts as a mask.
//
// Range key masking is facilitated by the keyspan.InterleavingIter. The
// interleaving iterator interleaves range keys and point keys during combined
// iteration. During user iteration, the interleaving iterator is configured
// with a keyspan.SpanMask, implemented by the rangeKeyMasking struct below.
// The SpanMask interface defines two methods: SpanChanged and SkipPoint.
//
// SpanChanged is used to keep the current mask up-to-date. Whenever the point
// iterator has stepped into or out of the bounds of a range key, the
// interleaving iterator invokes SpanChanged passing the current covering range
// key. The below rangeKeyMasking implementation scans the range keys looking
// for the range key with the largest suffix that's still ≤ the suffix supplied
// to IterOptions.RangeKeyMasking.Suffix (the "read timestamp"). If it finds a
// range key that meets the condition, the range key should act as a mask. The
// span and the relevant range key's suffix are saved.
//
// The above ensures that `rangeKeyMasking.maskActiveSuffix` always contains the
// current masking suffix such that any point keys with lower suffixes should be
// skipped.
//
// There are two ways in which masked point keys are skipped.
//
// 1. Interleaving iterator SkipPoint
//
// Whenever the interleaving iterator encounters a point key that falls within
// the bounds of a range key, it invokes SkipPoint. The interleaving iterator
// guarantees that the SpanChanged method described above has already been
// invoked with the covering range key. The below rangeKeyMasking implementation
// of SkipPoint splits the key into prefix and suffix, compares the suffix to
// the `maskActiveSuffix` updated by SpanChanged and returns true if
// suffix(point) < maskActiveSuffix.
//
// The SkipPoint logic is sufficient to ensure that the Pebble iterator filters
// out all masked point keys. However, it requires the iterator read each masked
// point key. For broad range keys that mask many points, this may be expensive.
//
// 2. Block property filter
//
// For more efficient handling of braad range keys that mask many points, the
// IterOptions.RangeKeyMasking field has an optional Filter option. This Filter
// field takes a superset of the block-property filter interface, adding a
// method to dynamically configure the filter's filtering criteria.
//
// To make use of the Filter option, the user is required to define and
// configure a block-property collector that collects a property containing at
// least the maximum suffix of a key within a block.
//
// When the SpanChanged method described above is invoked, rangeKeyMasking also
// reconfigures the user-provided filter. It invokes a SetSuffix method,
// providing the `maskActiveSuffix`, requesting that from now on the
// block-property filter return Intersects()=false for any properties indicating
// that a block contains exclusively keys with suffixes greater than the
// provided suffix.
//
// Note that unlike other block-property filters, the filter used for masking
// must not apply across the entire keyspace. It must only filter blocks that
// lie within the bounds of the range key that set the mask suffix. To
// accommodate this, rangeKeyMasking implements a special interface:
// sstable.BoundLimitedBlockPropertyFilter. This interface extends the block
// property filter interface with two new methods: KeyIsWithinLowerBound and
// KeyIsWithinUpperBound. The rangeKeyMasking type wraps the user-provided block
// property filter, implementing these two methods and overriding Intersects to
// always return true if there is no active mask.
//
// The logic to ensure that a mask block-property filter is only applied within
// the bounds of the masking range key is subtle. The interleaving iterator
// guarantees that it never invokes SpanChanged until the point iterator is
// positioned within the range key. During forward iteration, this guarantees
// that any block that a sstable reader might attempt to load contains only keys
// greater than or equal to the range key's lower bound. During backward
// iteration, it provides the analagous guarantee on the range key's upper
// bound.
//
// The above ensures that an sstable reader only needs to verify that a block
// that it skips meets the opposite bound. This is where the
// KeyIsWithinLowerBound and KeyIsWithinUpperBound methods are used. When an
// sstable iterator is configured with a BoundLimitedBlockPropertyFilter, it
// checks for intersection with the block-property filter before every block
// load, like ordinary block-property filters. However, if the bound-limited
// block property filter indicates that it does NOT intersect, the filter's
// relevant KeyIsWithin{Lower,Upper}Bound method is queried, using a block
// index separator as the bound. If the method indicates that the provided index
// separator does not fall within the range key bounds, the no-intersection
// result is ignored, and the block is read.
type rangeKeyMasking struct {
cmp base.Compare
suffixCmp base.CompareRangeSuffixes
split base.Split
filter BlockPropertyFilterMask
// maskActiveSuffix holds the suffix of a range key currently acting as a
// mask, hiding point keys with suffixes greater than it. maskActiveSuffix
// is only ever non-nil if IterOptions.RangeKeyMasking.Suffix is non-nil.
// maskActiveSuffix is updated whenever the iterator passes over a new range
// key. The maskActiveSuffix should only be used if maskSpan is non-nil.
//
// See SpanChanged.
maskActiveSuffix []byte
// maskSpan holds the span from which the active mask suffix was extracted.
// The span is used for bounds comparisons, to ensure that a range-key mask
// is not applied beyond the bounds of the range key.
maskSpan *keyspan.Span
parent *Iterator
}
func (m *rangeKeyMasking) init(parent *Iterator, c *base.Comparer) {
m.cmp = c.Compare
m.suffixCmp = c.CompareRangeSuffixes
m.split = c.Split
if parent.opts.RangeKeyMasking.Filter != nil {
m.filter = parent.opts.RangeKeyMasking.Filter()
}
m.parent = parent
}
// SpanChanged implements the keyspan.SpanMask interface, used during range key
// iteration.
func (m *rangeKeyMasking) SpanChanged(s *keyspan.Span) {
if s == nil && m.maskSpan == nil {
return
}
m.maskSpan = nil
m.maskActiveSuffix = m.maskActiveSuffix[:0]
// Find the smallest suffix of a range key contained within the Span,
// excluding suffixes less than m.opts.RangeKeyMasking.Suffix.
if s != nil {
m.parent.rangeKey.stale = true
if m.parent.opts.RangeKeyMasking.Suffix != nil {
for j := range s.Keys {
if s.Keys[j].Suffix == nil {
continue
}
if m.suffixCmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 {
continue
}
if len(m.maskActiveSuffix) == 0 || m.suffixCmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 {
m.maskSpan = s
m.maskActiveSuffix = append(m.maskActiveSuffix[:0], s.Keys[j].Suffix...)
}
}
}
}
if m.maskSpan != nil && m.parent.opts.RangeKeyMasking.Filter != nil {
// Update the block-property filter to filter point keys with suffixes
// greater than m.maskActiveSuffix.
err := m.filter.SetSuffix(m.maskActiveSuffix)
if err != nil {
m.parent.err = err
}
}
// If no span is active, we leave the inner block-property filter configured
// with its existing suffix. That's okay, because Intersects calls are first
// evaluated by iteratorRangeKeyState.Intersects, which considers all blocks
// as intersecting if there's no active mask.
}
// SkipPoint implements the keyspan.SpanMask interface, used during range key
// iteration. Whenever a point key is covered by a non-empty Span, the
// interleaving iterator invokes SkipPoint. This function is responsible for
// performing range key masking.
//
// If a non-nil IterOptions.RangeKeyMasking.Suffix is set, range key masking is
// enabled. Masking hides point keys, transparently skipping over the keys.
// Whether or not a point key is masked is determined by comparing the point
// key's suffix, the overlapping span's keys' suffixes, and the user-configured
// IterOption's RangeKeyMasking.Suffix. When configured with a masking threshold
// _t_, and there exists a span with suffix _r_ covering a point key with suffix
// _p_, and
//
// _t_ ≤ _r_ < _p_
//
// then the point key is elided. Consider the following rendering, where using
// integer suffixes with higher integers sort before suffixes with lower
// integers, (for example @7 ≤ @6 < @5):
//
// ^
// @9 | •―――――――――――――――○ [e,m)@9
// s 8 | • l@8
// u 7 |------------------------------------ @7 RangeKeyMasking.Suffix
// f 6 | [h,q)@6 •―――――――――――――――――○ (threshold)
// f 5 | • h@5
// f 4 | • n@4
// i 3 | •―――――――――――○ [f,l)@3
// x 2 | • b@2
// 1 |
// 0 |___________________________________
// a b c d e f g h i j k l m n o p q
//
// An iterator scanning the entire keyspace with the masking threshold set to @7
// will observe point keys b@2 and l@8. The span keys [h,q)@6 and [f,l)@3 serve
// as masks, because cmp(@6,@7) ≥ 0 and cmp(@3,@7) ≥ 0. The span key [e,m)@9
// does not serve as a mask, because cmp(@9,@7) < 0.
//
// Although point l@8 falls within the user key bounds of [e,m)@9, [e,m)@9 is
// non-masking due to its suffix. The point key l@8 also falls within the user
// key bounds of [h,q)@6, but since cmp(@6,@8) ≥ 0, l@8 is unmasked.
//
// Invariant: The userKey is within the user key bounds of the span most
// recently provided to `SpanChanged`.
func (m *rangeKeyMasking) SkipPoint(userKey []byte) bool {
m.parent.stats.RangeKeyStats.ContainedPoints++
if m.maskSpan == nil {
// No range key is currently acting as a mask, so don't skip.
return false
}
// Range key masking is enabled and the current span includes a range key
// that is being used as a mask. (NB: SpanChanged already verified that the
// range key's suffix is ≥ RangeKeyMasking.Suffix).
//
// This point key falls within the bounds of the range key (guaranteed by
// the InterleavingIter). Skip the point key if the range key's suffix is
// greater than the point key's suffix.
pointSuffix := userKey[m.split(userKey):]
if len(pointSuffix) > 0 && m.suffixCmp(m.maskActiveSuffix, pointSuffix) < 0 {
m.parent.stats.RangeKeyStats.SkippedPoints++
return true
}
return false
}
// The iteratorRangeKeyState type implements the sstable package's
// BoundLimitedBlockPropertyFilter interface in order to use block property
// filters for range key masking. The iteratorRangeKeyState implementation wraps
// the block-property filter provided in Options.RangeKeyMasking.Filter.
//
// Using a block-property filter for range-key masking requires limiting the
// filter's effect to the bounds of the range key currently acting as a mask.
// Consider the range key [a,m)@10, and an iterator positioned just before the
// below block, bounded by index separators `c` and `z`:
//
// c z
// x | c@9 c@5 c@1 d@7 e@4 y@4 | ...
// iter pos
//
// The next block cannot be skipped, despite the range key suffix @10 is greater
// than all the block's keys' suffixes, because it contains a key (y@4) outside
// the bounds of the range key.
//
// This extended BoundLimitedBlockPropertyFilter interface adds two new methods,
// KeyIsWithinLowerBound and KeyIsWithinUpperBound, for testing whether a
// particular block is within bounds.
//
// The iteratorRangeKeyState implements these new methods by first checking if
// the iterator is currently positioned within a range key. If not, the provided
// key is considered out-of-bounds. If the iterator is positioned within a range
// key, it compares the corresponding range key bound.
var _ sstable.BoundLimitedBlockPropertyFilter = (*rangeKeyMasking)(nil)
// Name implements the limitedBlockPropertyFilter interface defined in the
// sstable package by passing through to the user-defined block property filter.
func (m *rangeKeyMasking) Name() string {
return m.filter.Name()
}
// Intersects implements the limitedBlockPropertyFilter interface defined in the
// sstable package by passing the intersection decision to the user-provided
// block property filter only if a range key is covering the current iterator
// position.
func (m *rangeKeyMasking) Intersects(prop []byte) (bool, error) {
if m.maskSpan == nil {
// No span is actively masking.
return true, nil
}
return m.filter.Intersects(prop)
}
func (m *rangeKeyMasking) SyntheticSuffixIntersects(prop []byte, suffix []byte) (bool, error) {
if m.maskSpan == nil {
// No span is actively masking.
return true, nil
}
return m.filter.SyntheticSuffixIntersects(prop, suffix)
}
// KeyIsWithinLowerBound implements the limitedBlockPropertyFilter interface
// defined in the sstable package. It's used to restrict the masking block
// property filter to only applying within the bounds of the active range key.
func (m *rangeKeyMasking) KeyIsWithinLowerBound(key []byte) bool {
// Invariant: m.maskSpan != nil
//
// The provided `key` is an inclusive lower bound of the block we're
// considering skipping.
return m.cmp(m.maskSpan.Start, key) <= 0
}
// KeyIsWithinUpperBound implements the limitedBlockPropertyFilter interface
// defined in the sstable package. It's used to restrict the masking block
// property filter to only applying within the bounds of the active range key.
func (m *rangeKeyMasking) KeyIsWithinUpperBound(key []byte) bool {
// Invariant: m.maskSpan != nil
//
// The provided `key` is an *inclusive* upper bound of the block we're
// considering skipping, so the range key's end must be strictly greater
// than the block bound for the block to be within bounds.
return m.cmp(m.maskSpan.End, key) > 0
}
// lazyCombinedIter implements the internalIterator interface, wrapping a
// pointIter. It requires the pointIter's the levelIters be configured with
// pointers to its combinedIterState. When the levelIter observes a file
// containing a range key, the lazyCombinedIter constructs the combined
// range+point key iterator stack and switches to it.
type lazyCombinedIter struct {
// parent holds a pointer to the root *pebble.Iterator containing this
// iterator. It's used to mutate the internalIterator in use when switching
// to combined iteration.
parent *Iterator
pointIter internalIterator
combinedIterState combinedIterState
}
// combinedIterState encapsulates the current state of combined iteration.
// Various low-level iterators (mergingIter, leveliter) hold pointers to the
// *pebble.Iterator's combinedIterState. This allows them to check whether or
// not they must monitor for files containing range keys (!initialized), or not.
//
// When !initialized, low-level iterators watch for files containing range keys.
// When one is discovered, they set triggered=true and key to the smallest
// (forward direction) or largest (reverse direction) range key that's been
// observed.
type combinedIterState struct {
// key holds the smallest (forward direction) or largest (backward
// direction) user key from a range key bound discovered during the iterator
// operation that triggered the switch to combined iteration.
//
// Slices stored here must be stable. This is possible because callers pass
// a Smallest/Largest bound from a fileMetadata, which are immutable. A key
// slice's bytes must not be overwritten.
key []byte
triggered bool
initialized bool
}
// Assert that *lazyCombinedIter implements internalIterator.
var _ internalIterator = (*lazyCombinedIter)(nil)
// initCombinedIteration is invoked after a pointIter positioning operation
// resulted in i.combinedIterState.triggered=true.
//
// The `dir` parameter is `+1` or `-1` indicating forward iteration or backward
// iteration respectively.
//
// The `pointKey` and `pointValue` parameters provide the new point key-value
// pair that the iterator was just positioned to. The combined iterator should
// be seeded with this point key-value pair and return the smaller (forward
// iteration) or largest (backward iteration) of the two.
//
// The `seekKey` parameter is non-nil only if the iterator operation that
// triggered the switch to combined iteration was a SeekGE, SeekPrefixGE or
// SeekLT. It provides the seek key supplied and is used to seek the range-key
// iterator using the same key. This is necessary for SeekGE/SeekPrefixGE
// operations that land in the middle of a range key and must truncate to the
// user-provided seek key.
func (i *lazyCombinedIter) initCombinedIteration(
dir int8, pointKV *base.InternalKV, seekKey []byte,
) *base.InternalKV {
// Invariant: i.parent.rangeKey is nil.
// Invariant: !i.combinedIterState.initialized.
if invariants.Enabled {
if i.combinedIterState.initialized {
panic("pebble: combined iterator already initialized")
}
if i.parent.rangeKey != nil {
panic("pebble: iterator already has a range-key iterator stack")
}
}
// We need to determine the key to seek the range key iterator to. If
// seekKey is not nil, the user-initiated operation that triggered the
// switch to combined iteration was itself a seek, and we can use that key.
// Otherwise, a First/Last or relative positioning operation triggered the
// switch to combined iteration.
//
// The levelIter that observed a file containing range keys populated
// combinedIterState.key with the smallest (forward) or largest (backward)
// range key it observed. If multiple levelIters observed files with range
// keys during the same operation on the mergingIter, combinedIterState.key
// is the smallest [during forward iteration; largest in reverse iteration]
// such key.
if seekKey == nil {
// Use the levelIter-populated key.
seekKey = i.combinedIterState.key
// We may need to adjust the levelIter-populated seek key to the
// surfaced point key. If the key observed is beyond [in the iteration
// direction] the current point key, there may still exist a range key
// at an earlier key. Consider the following example:
//
// L5: 000003:[bar.DEL.5, foo.RANGEKEYSET.9]
// L6: 000001:[bar.SET.2] 000002:[bax.RANGEKEYSET.8]
//
// A call to First() seeks the levels to files L5.000003 and L6.000001.
// The L5 levelIter observes that L5.000003 contains the range key with
// start key `foo`, and triggers a switch to combined iteration, setting
// `combinedIterState.key` = `foo`.
//
// The L6 levelIter did not observe the true first range key
// (bax.RANGEKEYSET.8), because it appears in a later sstable. When the
// combined iterator is initialized, the range key iterator must be
// seeked to a key that will find `bax`. To accomplish this, we seek the
// key instead to `bar`. It is guaranteed that no range key exists
// earlier than `bar`, otherwise a levelIter would've observed it and
// set `combinedIterState.key` to its start key.
if pointKV != nil {
if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKV.K.UserKey) > 0 {
seekKey = pointKV.K.UserKey
} else if dir == -1 && i.parent.cmp(seekKey, pointKV.K.UserKey) < 0 {
seekKey = pointKV.K.UserKey
}
}
}
// An operation on the point iterator observed a file containing range keys,
// so we must switch to combined interleaving iteration. First, construct
// the range key iterator stack. It must not exist, otherwise we'd already
// be performing combined iteration.
i.parent.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
i.parent.rangeKey.init(i.parent.comparer.Compare, i.parent.comparer.Split, &i.parent.opts)
i.parent.constructRangeKeyIter()
// Initialize the Iterator's interleaving iterator.
i.parent.rangeKey.iiter.Init(
&i.parent.comparer, i.parent.pointIter, i.parent.rangeKey.rangeKeyIter,
keyspan.InterleavingIterOpts{
Mask: &i.parent.rangeKeyMasking,
LowerBound: i.parent.opts.LowerBound,
UpperBound: i.parent.opts.UpperBound,
})
// Set the parent's primary iterator to point to the combined, interleaving
// iterator that's now initialized with our current state.
i.parent.iter = &i.parent.rangeKey.iiter
i.combinedIterState.initialized = true
i.combinedIterState.key = nil
// All future iterator operations will go directly through the combined
// iterator.
//
// Initialize the interleaving iterator. We pass the point key-value pair so
// that the interleaving iterator knows where the point iterator is
// positioned. Additionally, we pass the seek key to which the range-key
// iterator should be seeked in order to initialize its position.
//
// In the forward direction (invert for backwards), the seek key is a key
// guaranteed to find the smallest range key that's greater than the last
// key the iterator returned. The range key may be less than pointKV, in
// which case the range key will be interleaved next instead of the point
// key.
if dir == +1 {
var prefix []byte
if i.parent.hasPrefix {
prefix = i.parent.prefixOrFullSeekKey
}
return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKV)
}
return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKV)
}
func (i *lazyCombinedIter) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.SeekGE(key, flags)
}
kv := i.pointIter.SeekGE(key, flags)
if i.combinedIterState.triggered {
return i.initCombinedIteration(+1, kv, key)
}
return kv
}
func (i *lazyCombinedIter) SeekPrefixGE(
prefix, key []byte, flags base.SeekGEFlags,
) *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.SeekPrefixGE(prefix, key, flags)
}
kv := i.pointIter.SeekPrefixGE(prefix, key, flags)
if i.combinedIterState.triggered {
return i.initCombinedIteration(+1, kv, key)
}
return kv
}
func (i *lazyCombinedIter) SeekLT(key []byte, flags base.SeekLTFlags) *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.SeekLT(key, flags)
}
kv := i.pointIter.SeekLT(key, flags)
if i.combinedIterState.triggered {
return i.initCombinedIteration(-1, kv, key)
}
return kv
}
func (i *lazyCombinedIter) First() *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.First()
}
kv := i.pointIter.First()
if i.combinedIterState.triggered {
return i.initCombinedIteration(+1, kv, nil)
}
return kv
}
func (i *lazyCombinedIter) Last() *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.Last()
}
kv := i.pointIter.Last()
if i.combinedIterState.triggered {
return i.initCombinedIteration(-1, kv, nil)
}
return kv
}
func (i *lazyCombinedIter) Next() *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.Next()
}
kv := i.pointIter.Next()
if i.combinedIterState.triggered {
return i.initCombinedIteration(+1, kv, nil)
}
return kv
}
func (i *lazyCombinedIter) NextPrefix(succKey []byte) *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.NextPrefix(succKey)
}
kv := i.pointIter.NextPrefix(succKey)
if i.combinedIterState.triggered {
return i.initCombinedIteration(+1, kv, nil)
}
return kv
}
func (i *lazyCombinedIter) Prev() *base.InternalKV {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.Prev()
}
kv := i.pointIter.Prev()
if i.combinedIterState.triggered {
return i.initCombinedIteration(-1, kv, nil)
}
return kv
}
func (i *lazyCombinedIter) Error() error {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.Error()
}
return i.pointIter.Error()
}
func (i *lazyCombinedIter) Close() error {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.Close()
}
return i.pointIter.Close()
}
func (i *lazyCombinedIter) SetBounds(lower, upper []byte) {
if i.combinedIterState.initialized {
i.parent.rangeKey.iiter.SetBounds(lower, upper)
return
}
i.pointIter.SetBounds(lower, upper)
}
func (i *lazyCombinedIter) SetContext(ctx context.Context) {
if i.combinedIterState.initialized {
i.parent.rangeKey.iiter.SetContext(ctx)
return
}
i.pointIter.SetContext(ctx)
}
// DebugTree is part of the InternalIterator interface.
func (i *lazyCombinedIter) DebugTree(tp treeprinter.Node) {
n := tp.Childf("%T(%p)", i, i)
if i.combinedIterState.initialized {
i.parent.rangeKey.iiter.DebugTree(n)
} else {
i.pointIter.DebugTree(n)
}
}
func (i *lazyCombinedIter) String() string {
if i.combinedIterState.initialized {
return i.parent.rangeKey.iiter.String()
}
return i.pointIter.String()
}