Skip to content

Commit 9b68e5c

Browse files
committed
arenaskl: optimize TrySeekUsingNext with skiplist tower traversal
Previously, TrySeekUsingNext would perform up to 5 simple Next() operations at level 0. If the target key wasn't found, all that work was abandoned and the iterator fell back to a full seek from the top of the skiplist. This wasted both the forward progress made and the information gained during those Next operations. The new implementation leverages the skiplist tower structure to make seeking more efficient. Instead of advancing only at level 0, it attempts to climb to higher levels when possible, allowing it to skip over many nodes efficiently. When a node >= target is found at a higher level, it descends through progressively lower levels to find the exact first node >= target at level 0. This approach better utilizes the skiplist's hierarchical structure for workloads with sequential access patterns. The optimization shows a modest improvement in the SeekPrefixGE benchmark, particularly as the skip distance between seeks increases. With `use-next=true`, performance improves by up to 5.5% for larger skips (`skip=16`), while shorter skips show smaller gains. 
``` goos: darwin goarch: arm64 pkg: github.com/cockroachdb/pebble/internal/arenaskl cpu: Apple M2 Pro │ ./bench_master.txt │ ./bench_optimized.txt │ │ sec/op │ sec/op vs base │ SeekPrefixGE/skip=1/use-next=false-12 180.8n ± 2% 186.8n ± 2% +3.32% (p=0.015 n=6) SeekPrefixGE/skip=1/use-next=true-12 88.26n ± 10% 89.05n ± 3% ~ (p=0.699 n=6) SeekPrefixGE/skip=2/use-next=false-12 183.0n ± 5% 184.9n ± 2% ~ (p=1.000 n=6) SeekPrefixGE/skip=2/use-next=true-12 94.73n ± 2% 93.31n ± 0% -1.49% (p=0.002 n=6) SeekPrefixGE/skip=4/use-next=false-12 175.9n ± 4% 178.1n ± 1% ~ (p=0.065 n=6) SeekPrefixGE/skip=4/use-next=true-12 114.5n ± 1% 111.0n ± 3% -3.01% (p=0.006 n=6) SeekPrefixGE/skip=8/use-next-false-12 177.2n ± 5% 177.6n ± 3% ~ (p=0.615 n=6) SeekPrefixGE/skip=8/use-next=true-12 219.1n ± 1% 216.4n ± 3% ~ (p=0.485 n=6) SeekPrefixGE/skip=16/use-next-false-12 180.2n ± 2% 179.2n ± 3% ~ (p=0.288 n=6) SeekPrefixGE/skip=16/use-next=true-12 208.3n ± 3% 196.8n ± 6% -5.50% (p=0.009 n=6) geomean 155.2n 154.4n -0.53% │ ./bench_master.txt │ ./bench_optimized.txt │ │ B/op │ B/op vs base │ SeekPrefixGE/skip=1/use-next=false-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=1/use-next=true-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=2/use-next=false-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=2/use-next=true-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=4/use-next=false-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=4/use-next=true-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=8/use-next-false-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=8/use-next=true-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=16/use-next-false-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=16/use-next=true-12 22.00 ± 0% 22.00 ± 0% ~ (p=1.000 n=6) ¹ geomean 22.00 22.00 +0.00% ¹ all samples are equal │ ./bench_master.txt │ ./bench_optimized.txt │ │ allocs/op │ allocs/op vs base │ 
SeekPrefixGE/skip=1/use-next=false-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=1/use-next=true-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=2/use-next=false-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=2/use-next=true-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=4/use-next=false-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=4/use-next=true-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=8/use-next-false-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=8/use-next=true-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=16/use-next-false-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ SeekPrefixGE/skip=16/use-next=true-12 2.000 ± 0% 2.000 ± 0% ~ (p=1.000 n=6) ¹ geomean 2.000 2.000 +0.00% ¹ all samples are equal ``` Fixes #5358
1 parent 8dfb102 commit 9b68e5c

File tree

1 file changed

+144
-13
lines changed

1 file changed

+144
-13
lines changed

internal/arenaskl/iterator.go

Lines changed: 144 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -97,22 +97,27 @@ func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV
9797
return nil
9898
}
9999
less := it.list.cmp(it.kv.K.UserKey, key) < 0
100-
// Arbitrary constant. By measuring the seek cost as a function of the
101-
// number of elements in the skip list, and fitting to a model, we
102-
// could adjust the number of nexts based on the current size of the
103-
// skip list.
104-
const numNexts = 5
105-
kv := &it.kv
106-
for i := 0; less && i < numNexts; i++ {
107-
if kv = it.Next(); kv == nil {
108-
// Iterator is done.
100+
// Use a more efficient algorithm that takes advantage of the skiplist
101+
// tower structure. Instead of just doing Next operations at level 0,
102+
// we climb up levels when possible to skip over more nodes.
103+
const maxSteps = 5
104+
if less && it.trySeekUsingNextWithLevels(key, maxSteps) {
105+
// Found the key or determined we're positioned correctly.
106+
if it.nd == it.list.tail || it.nd == it.upperNode {
109107
return nil
110108
}
111-
less = it.list.cmp(kv.K.UserKey, key) < 0
112-
}
113-
if !less {
114-
return kv
109+
it.decodeKey()
110+
if it.upper != nil && it.list.cmp(it.upper, it.kv.K.UserKey) <= 0 {
111+
it.upperNode = it.nd
112+
return nil
113+
}
114+
it.kv.V = base.MakeInPlaceValue(it.value())
115+
return &it.kv
116+
} else if !less {
117+
// Current position already satisfies the seek.
118+
return &it.kv
115119
}
120+
// Fall through to full seek if we didn't find it quickly.
116121
}
117122
_, it.nd = it.seekForBaseSplice(key)
118123
if it.nd == it.list.tail || it.nd == it.upperNode {
@@ -259,6 +264,132 @@ func (it *Iterator) decodeKey() {
259264
it.kv.K.Trailer = it.nd.keyTrailer
260265
}
261266

267+
// trySeekUsingNextWithLevels implements a fast path for SeekGE when the
268+
// TrySeekUsingNext flag is set. It capitalizes on the assumption that for
269+
// sequential access patterns, the target 'key' is likely located shortly after
270+
// the iterator's current position 'it.nd'. Instead of initiating a full O(log N)
271+
// seek from the head of the skiplist, this function performs a bounded search
272+
// forward from the current node.
//
// The SeekGE caller only invokes this when the key at it.nd compares < key.
// Every node the search advances to also compares < key, which is what makes
// the first node >= key found afterwards the seek result.
//
// Return value:
//   - true: a node whose key compares >= 'key' was located and stored in
//     it.nd. Iterator bounds are not consulted here; the caller is responsible
//     for the tail / upper-bound checks and for decoding the key and value.
//   - false: no such node was located, either because 'maxSteps' advances were
//     used up or because no examined level offered a non-tail successor. it.nd
//     is left at the last node visited (its key compares < 'key'), so the
//     iterator is not positioned at the seek result and the caller must still
//     perform a full seek.
273+
func (it *Iterator) trySeekUsingNextWithLevels(key []byte, maxSteps int) bool {
274+
nd := it.nd
275+
// level is the lowest level the advance phase will try. It is only ever
// assigned candidateLevel (>= level), so it never decreases in this loop.
level := 0
276+
maxLevel := int(it.list.Height()) - 1
277+
278+
// The strategy is a two-phase process:
279+
// 1. Advance using upper levels: The function attempts to "climb" the skiplist
280+
// tower from the current node, using the "express lanes" of higher levels to
281+
// skip over many nodes at once. It continues advancing the current node 'nd'
282+
// as long as the next node at a given level is less than the target key.
283+
// 2. Descend for precision: Once a 'next' node is found on an upper level that
284+
// is >= key (an "overshoot"), we know the target lies between the current 'nd'
285+
// and that 'next' node. The function then switches to a top-down search
286+
// ('searchDownToLevel0') from the current 'nd' to pinpoint the exact first
287+
// node >= key at level 0.
288+
//
289+
// This approach avoids the cost of a full seek for workloads that exhibit good
290+
// locality, turning a potential O(log N) operation into a near O(1) one. The
291+
// search is bounded by 'maxSteps' to prevent excessive work if the key is far away.
// Note that 'maxSteps' only limits the number of advances made in phase 1
// (one advance per step); the phase 2 descent in 'searchDownToLevel0' has no
// step limit of its own.
292+
//
293+
// Example: it.nd=(10), seek for key=(40)
294+
//
295+
// (1) Find overshoot at L2
296+
// nd=(10) -----------------------------> next=(50) (50 >= 40, overshoot!)
297+
// L2 o--->+-------+ +-------+<---o
298+
// | 10 | | 50 |
299+
// L1 o--->+-------+------>+-------+------------>+-------+<---o
300+
// | | | 20 | | |
301+
// L0 o-..->+-------+->..-->+-------+->..->+------>+-------+<---o
302+
// (2) Begin searchDownToLevel0 from nd=(10), startLevel=2
303+
//
304+
// (3) Descend to L1. From (10), next is (20). (20 < 40), so advance nd to (20).
305+
//
306+
// nd=(20) ------------> next=(50) (50 >= 40)
307+
// L1 o--->+-------+------>+-------+------------>+-------+<---o
308+
// | 10 | | 20 | | 50 |
309+
// L0 o-..->+-------+->..-->+-------+->..->+------>+-------+<---o
310+
// (4) Cannot advance on L1. Descend to L0 from (20).
311+
//
312+
// (5) At L0, advance nd from (20) -> (25) -> (30).
313+
// (6) At nd=(30), next is (40). (40 >= 40). Stop.
314+
// (7) Return 'next', which is the target node (40).
315+
for step := 0; step < maxSteps; step++ {
316+
// Try to advance at the current level or higher levels.
317+
advanced := false
318+
for candidateLevel := level; candidateLevel <= maxLevel; candidateLevel++ {
319+
// NOTE(review): candidateLevel can exceed the level on which nd was
// reached. Whether nd's tower actually extends to candidateLevel is not
// visible from this function -- confirm getNext is safe to call for
// levels a node does not participate in.
next := it.list.getNext(nd, candidateLevel)
320+
if next == it.list.tail {
321+
// Can't advance at this level, try next higher level.
// NOTE(review): if upper levels only link nodes that are also linked
// on lower levels, a tail link here presumably implies a tail link on
// every higher level as well -- TODO confirm whether scanning upward
// can ever find a successor in this case.
322+
continue
323+
}
324+
325+
// Check if next.key >= key.
// The comparison uses only the key bytes stored in the arena for 'next';
// the node's keyTrailer is not part of it.
326+
offset, size := next.keyOffset, next.keySize
327+
nextKey := it.list.arena.buf[offset : offset+size]
328+
cmp := it.list.cmp(key, nextKey)
329+
330+
if cmp <= 0 {
331+
// Found a node >= key at candidateLevel.
332+
// Descend to level 0 to find the exact first node >= key.
333+
if candidateLevel == 0 {
// At level 0 there are no nodes between nd (< key) and next, so next
// is already the answer.
334+
it.nd = next
335+
return true
336+
}
337+
// Search down from nd (which is < key) to find first node >= key.
// This descent is not counted against maxSteps.
338+
it.nd = it.searchDownToLevel0(nd, candidateLevel, key)
339+
return true
340+
}
341+
342+
// next.key < key at candidateLevel. If we're at a higher level,
343+
// this means we can skip many nodes at level 0. Advance and stay
344+
// at this higher level.
345+
nd = next
346+
level = candidateLevel
347+
advanced = true
348+
// One advance per step: leave the level scan so the outer loop counts it.
break
349+
}
350+
351+
// If we couldn't advance at any level, we're done.
352+
if !advanced {
353+
break
354+
}
355+
}
356+
357+
// Didn't find the key within maxSteps, but update position.
// The SeekGE caller falls through to seekForBaseSplice after a false return,
// which assigns it.nd again.
358+
it.nd = nd
359+
return false
360+
}
361+
362+
// searchDownToLevel0 descends from the given node and level down to level 0,
363+
// searching for the first node >= key. It starts at 'nd' which is known to be < key,
364+
// and searches forward at progressively lower levels.
//
// Parameters:
//   - nd: starting node; the caller guarantees its key compares < 'key'.
//   - startLevel: the level on which the caller already observed a successor
//     >= 'key'. Scanning begins at startLevel-1; when startLevel is 0 no level
//     is scanned and the level-0 successor of 'nd' is returned directly.
//   - key: the seek target, compared against the key bytes each node stores in
//     the arena (the node's keyTrailer is not consulted).
//
// It returns the level-0 node that follows the last node found to be < 'key'.
// That may be the list's tail if level 0 runs out of nodes before one >= 'key'
// is seen. The function only reads through it.list; it does not assign any
// iterator field, and the number of nodes it visits is not capped.
365+
func (it *Iterator) searchDownToLevel0(nd *node, startLevel int, key []byte) *node {
366+
for level := startLevel - 1; level >= 0; level-- {
367+
// Walk forward on this level while successors are still < key.
for {
368+
next := it.list.getNext(nd, level)
369+
if next == it.list.tail {
370+
// Reached end at this level, move down.
371+
break
372+
}
373+
374+
offset, size := next.keyOffset, next.keySize
375+
nextKey := it.list.arena.buf[offset : offset+size]
376+
cmp := it.list.cmp(key, nextKey)
377+
if cmp <= 0 {
378+
// Found a node >= key at this level. If we're at level 0, return it.
379+
if level == 0 {
380+
return next
381+
}
382+
// Otherwise, move down to continue searching.
// nd stays put: nodes between nd and next on lower levels may still be
// < key and must be examined.
383+
break
384+
}
385+
// next.key < key, keep advancing at this level.
386+
nd = next
387+
}
388+
}
389+
// If we get here, nd is the last node < key, so return the next node at level 0.
// This point is reached when startLevel is 0 or when the level-0 scan hit
// the tail; the link is read again rather than reusing a value from the loop.
390+
return it.list.getNext(nd, 0)
391+
}
392+
262393
func (it *Iterator) seekForBaseSplice(key []byte) (prev, next *node) {
263394
prev = it.list.head
264395
for level := int(it.list.Height() - 1); level >= 0; level-- {

0 commit comments

Comments
 (0)