@@ -97,22 +97,27 @@ func (it *Iterator) SeekGE(key []byte, flags base.SeekGEFlags) *base.InternalKV
9797 return nil
9898 }
9999 less := it .list .cmp (it .kv .K .UserKey , key ) < 0
100- // Arbitrary constant. By measuring the seek cost as a function of the
101- // number of elements in the skip list, and fitting to a model, we
102- // could adjust the number of nexts based on the current size of the
103- // skip list.
104- const numNexts = 5
105- kv := & it .kv
106- for i := 0 ; less && i < numNexts ; i ++ {
107- if kv = it .Next (); kv == nil {
108- // Iterator is done.
100+ // Use a more efficient algorithm that takes advantage of the skiplist
101+ // tower structure. Instead of just doing Next operations at level 0,
102+ // we climb up levels when possible to skip over more nodes.
103+ const maxSteps = 5
104+ if less && it .trySeekUsingNextWithLevels (key , maxSteps ) {
105+ // Found the key or determined we're positioned correctly.
106+ if it .nd == it .list .tail || it .nd == it .upperNode {
109107 return nil
110108 }
111- less = it .list .cmp (kv .K .UserKey , key ) < 0
112- }
113- if ! less {
114- return kv
109+ it .decodeKey ()
110+ if it .upper != nil && it .list .cmp (it .upper , it .kv .K .UserKey ) <= 0 {
111+ it .upperNode = it .nd
112+ return nil
113+ }
114+ it .kv .V = base .MakeInPlaceValue (it .value ())
115+ return & it .kv
116+ } else if ! less {
117+ // Current position already satisfies the seek.
118+ return & it .kv
115119 }
120+ // Fall through to full seek if we didn't find it quickly.
116121 }
117122 _ , it .nd = it .seekForBaseSplice (key )
118123 if it .nd == it .list .tail || it .nd == it .upperNode {
@@ -259,6 +264,132 @@ func (it *Iterator) decodeKey() {
259264 it .kv .K .Trailer = it .nd .keyTrailer
260265}
261266
267+ // trySeekUsingNextWithLevels implements a fast path for SeekGE when the
268+ // TrySeekUsingNext flag is set. It capitalizes on the assumption that for
269+ // sequential access patterns, the target 'key' is likely located shortly after
270+ // the iterator's current position 'it.nd'. Instead of initiating a full O(log N)
271+ // seek from the head of the skiplist, this function performs a bounded search
272+ // forward from the current node.
273+ func (it * Iterator ) trySeekUsingNextWithLevels (key []byte , maxSteps int ) bool {
274+ nd := it .nd
275+ level := 0
276+ maxLevel := int (it .list .Height ()) - 1
277+
278+ // The strategy is a two-phase process:
279+ // 1. Advance using upper levels: The function attempts to "climb" the skiplist
280+ // tower from the current node, using the "express lanes" of higher levels to
281+ // skip over many nodes at once. It continues advancing the current node 'nd'
282+ // as long as the next node at a given level is less than the target key.
283+ // 2. Descend for precision: Once a 'next' node is found on an upper level that
284+ // is >= key (an "overshoot"), we know the target lies between the current 'nd'
285+ // and that 'next' node. The function then switches to a top-down search
286+ // ('searchDownToLevel0') from the current 'nd' to pinpoint the exact first
287+ // node >= key at level 0.
288+ //
289+ // This approach avoids the cost of a full seek for workloads that exhibit good
290+ // locality, turning a potential O(log N) operation into a near O(1) one. The
291+ // search is bounded by 'maxSteps' to prevent excessive work if the key is far away.
292+ //
293+ // Example: it.nd=(10), seek for key=(40)
294+ //
295+ // (1) Find overshoot at L2
296+ // nd=(10) -----------------------------> next=(50) (50 >= 40, overshoot!)
297+ // L2 o--->+-------+ +-------+<---o
298+ // | 10 | | 50 |
299+ // L1 o--->+-------+------>+-------+------------>+-------+<---o
300+ // | | | 20 | | |
301+ // L0 o-..->+-------+->..-->+-------+->..->+------>+-------+<---o
302+ // (2) Begin searchDownToLevel0 from nd=(10), startLevel=2
303+ //
304+ // (3) Descend to L1. From (10), next is (20). (20 < 40), so advance nd to (20).
305+ //
306+ // nd=(20) ------------> next=(50) (50 >= 40)
307+ // L1 o--->+-------+------>+-------+------------>+-------+<---o
308+ // | 10 | | 20 | | 50 |
309+ // L0 o-..->+-------+->..-->+-------+->..->+------>+-------+<---o
310+ // (4) Cannot advance on L1. Descend to L0 from (20).
311+ //
312+ // (5) At L0, advance nd from (20) -> (25) -> (30).
313+ // (6) At nd=(30), next is (40). (40 >= 40). Stop.
314+ // (7) Return 'next', which is the target node (40).
315+ for step := 0 ; step < maxSteps ; step ++ {
316+ // Try to advance at the current level or higher levels.
317+ advanced := false
318+ for candidateLevel := level ; candidateLevel <= maxLevel ; candidateLevel ++ {
319+ next := it .list .getNext (nd , candidateLevel )
320+ if next == it .list .tail {
321+ // Can't advance at this level, try next higher level.
322+ continue
323+ }
324+
325+ // Check if next.key >= key.
326+ offset , size := next .keyOffset , next .keySize
327+ nextKey := it .list .arena .buf [offset : offset + size ]
328+ cmp := it .list .cmp (key , nextKey )
329+
330+ if cmp <= 0 {
331+ // Found a node >= key at candidateLevel.
332+ // Descend to level 0 to find the exact first node >= key.
333+ if candidateLevel == 0 {
334+ it .nd = next
335+ return true
336+ }
337+ // Search down from nd (which is < key) to find first node >= key.
338+ it .nd = it .searchDownToLevel0 (nd , candidateLevel , key )
339+ return true
340+ }
341+
342+ // next.key < key at candidateLevel. If we're at a higher level,
343+ // this means we can skip many nodes at level 0. Advance and stay
344+ // at this higher level.
345+ nd = next
346+ level = candidateLevel
347+ advanced = true
348+ break
349+ }
350+
351+ // If we couldn't advance at any level, we're done.
352+ if ! advanced {
353+ break
354+ }
355+ }
356+
357+ // Didn't find the key within maxSteps, but update position.
358+ it .nd = nd
359+ return false
360+ }
361+
362+ // searchDownToLevel0 descends from the given node and level down to level 0,
363+ // searching for the first node >= key. It starts at 'nd' which is known to be < key,
364+ // and searches forward at progressively lower levels.
365+ func (it * Iterator ) searchDownToLevel0 (nd * node , startLevel int , key []byte ) * node {
366+ for level := startLevel - 1 ; level >= 0 ; level -- {
367+ for {
368+ next := it .list .getNext (nd , level )
369+ if next == it .list .tail {
370+ // Reached end at this level, move down.
371+ break
372+ }
373+
374+ offset , size := next .keyOffset , next .keySize
375+ nextKey := it .list .arena .buf [offset : offset + size ]
376+ cmp := it .list .cmp (key , nextKey )
377+ if cmp <= 0 {
378+ // Found a node >= key at this level. If we're at level 0, return it.
379+ if level == 0 {
380+ return next
381+ }
382+ // Otherwise, move down to continue searching.
383+ break
384+ }
385+ // next.key < key, keep advancing at this level.
386+ nd = next
387+ }
388+ }
389+ // If we get here, nd is the last node < key, so return the next node at level 0.
390+ return it .list .getNext (nd , 0 )
391+ }
392+
262393func (it * Iterator ) seekForBaseSplice (key []byte ) (prev , next * node ) {
263394 prev = it .list .head
264395 for level := int (it .list .Height () - 1 ); level >= 0 ; level -- {
0 commit comments