/*
 * Copyright 2017 Dgraph Labs, Inc. and Contributors
 * Modifications copyright (C) 2017 Andy Kimball and Contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License")
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
Adapted from RocksDB inline skiplist.

Key differences:
- No optimization for sequential inserts (no "prev").
- No custom comparator.
- Support overwrites. This requires care when we see the same key when inserting.
  For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so
  there is no need for values. We don't intend to support versioning. In-place updates of values
  would be more efficient.
- We discard all non-concurrent code.
- We do not support Splices. This simplifies the code a lot.
- No AllocateNode or other pointer arithmetic.
- We combine the findLessThan, findGreaterOrEqual, etc into one function.
*/

/*
Further adapted from Badger: https://github.com/dgraph-io/badger.

Key differences:
- Support for previous pointers - doubly linked lists. Note that it's up to higher
  level code to deal with the intermediate state that occurs during insertion,
  where node A is linked to node B, but node B is not yet linked back to node A.
- Iterator includes mutator functions.
*/

/*
Further adapted from arenaskl: https://github.com/andy-kimball/arenaskl

Key differences:
- Removed support for deletion.
- Removed support for concurrency.
- External storage of keys.
- Node storage grows to an arbitrary size.
*/

package batchskl // import "github.com/cockroachdb/pebble/internal/batchskl"

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"time"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/constants"
	"golang.org/x/exp/rand"
)

const (
	maxHeight    = 20
	maxNodeSize  = uint64(unsafe.Sizeof(node{}))
	linksSize    = uint64(unsafe.Sizeof(links{}))
	maxNodesSize = constants.MaxUint32OrInt
)

var (
	// ErrExists indicates that a duplicate record was inserted. This should never
	// happen for normal usage of batchskl as every key should have a unique
	// sequence number.
	ErrExists = errors.New("record with this key already exists")

	// ErrTooManyRecords is a sentinel error returned when the size of the raw
	// nodes slice exceeds the maximum allowed size (currently 1 << 32 - 1). This
	// corresponds to ~117 M skiplist entries.
	ErrTooManyRecords = errors.New("too many records")
)

type links struct {
	next uint32
	prev uint32
}

type node struct {
	// The offset of the start of the record in the storage.
	offset uint32
	// The offsets of the start and end of the key in storage.
	keyStart uint32
	keyEnd   uint32
	// A fixed 8-byte abbreviation of the key, used to avoid retrieval of the key
	// during seek operations. The key retrieval can be expensive purely due to
	// cache misses, while the abbreviatedKey stored here will be in the same
	// cache line as the key and the links, making accessing and comparing against
	// it almost free.
	abbreviatedKey uint64
	// Most nodes do not need to use the full height of the link tower, since the
	// probability of each successive level decreases exponentially. Because
	// these elements are never accessed, they do not need to be allocated.
	// Therefore, when a node is allocated, its memory footprint is deliberately
	// truncated to not include unneeded link elements.
	links [maxHeight]links
}

// Skiplist is a fast, non-concurrent skiplist implementation that supports
// forward and backward iteration. See arenaskl.Skiplist for a concurrent
// skiplist. Keys and values are stored externally from the skiplist via the
// Storage interface. Deletion is not supported. Instead, higher-level code is
// expected to perform deletion via tombstones and needs to process those
// tombstones appropriately during retrieval operations.
type Skiplist struct {
	storage        *[]byte
	cmp            base.Compare
	abbreviatedKey base.AbbreviatedKey
	nodes          []byte
	head           uint32
	tail           uint32
	height         uint32 // Current height: 1 <= height <= maxHeight
	rand           rand.PCGSource
}
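
// Illustrative layout sketch (a reading aid, not load-bearing documentation):
// nodes is a flat byte slice into which node structs are written and addressed
// by their byte offset, while record and key bytes live in the external
// storage slice:
//
//	s.nodes:    [head node][tail node][node A][node B]...
//	*s.storage: ...[kind byte][key length varint][key bytes]...
//	               ^node.offset                  ^node.keyStart..keyEnd
//
// Each node's links hold uint32 offsets of neighboring nodes within s.nodes,
// which is why the total node storage is bounded by maxNodesSize.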

var probabilities [maxHeight]uint32

func init() {
	const pValue = 1 / math.E

	// Precompute the skiplist probabilities so that only a single random number
	// needs to be generated and so that the optimal pvalue can be used (inverse
	// of Euler's number).
	p := float64(1.0)
	for i := 0; i < maxHeight; i++ {
		probabilities[i] = uint32(float64(math.MaxUint32) * p)
		p *= pValue
	}
}

// NewSkiplist constructs and initializes a new, empty skiplist.
func NewSkiplist(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) *Skiplist {
	s := &Skiplist{}
	s.Init(storage, cmp, abbreviatedKey)
	return s
}

// Reset the fields in the skiplist for reuse.
func (s *Skiplist) Reset() {
	*s = Skiplist{
		nodes:  s.nodes[:0],
		height: 1,
	}
	const batchMaxRetainedSize = 1 << 20 // 1 MB
	if cap(s.nodes) > batchMaxRetainedSize {
		s.nodes = nil
	}
}

// Init the skiplist to empty and re-initialize.
func (s *Skiplist) Init(storage *[]byte, cmp base.Compare, abbreviatedKey base.AbbreviatedKey) {
	*s = Skiplist{
		storage:        storage,
		cmp:            cmp,
		abbreviatedKey: abbreviatedKey,
		nodes:          s.nodes[:0],
		height:         1,
	}
	s.rand.Seed(uint64(time.Now().UnixNano()))

	const initBufSize = 256
	if cap(s.nodes) < initBufSize {
		s.nodes = make([]byte, 0, initBufSize)
	}

	// Allocate head and tail nodes. While allocating a new node can fail, in the
	// context of initializing the skiplist we consider it unrecoverable.
	var err error
	s.head, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}
	s.tail, err = s.newNode(maxHeight, 0, 0, 0, 0)
	if err != nil {
		panic(err)
	}

	// Link all head/tail levels together.
	headNode := s.node(s.head)
	tailNode := s.node(s.tail)
	for i := uint32(0); i < maxHeight; i++ {
		headNode.links[i].next = s.tail
		tailNode.links[i].prev = s.head
	}
}
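
// Per the decoding logic in Add below, the batch entry at keyOffset is laid
// out in storage as:
//
//	[kind: 1 byte][key length: uvarint][key bytes]
//
// so keyStart = keyOffset + 1 + uvarint-width, and keyEnd = keyStart + the
// decoded key length.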

// Add adds a new key to the skiplist if it does not yet exist. If the record
// already exists, then Add returns ErrExists.
func (s *Skiplist) Add(keyOffset uint32) error {
	data := (*s.storage)[keyOffset+1:]
	v, n := binary.Uvarint(data)
	if n <= 0 {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	data = data[n:]
	if v > uint64(len(data)) {
		return errors.Errorf("corrupted batch entry: %d", errors.Safe(keyOffset))
	}
	keyStart := 1 + keyOffset + uint32(n)
	keyEnd := keyStart + uint32(v)
	key := data[:v]
	abbreviatedKey := s.abbreviatedKey(key)

	// spl holds the list of next and previous links for each level in the
	// skiplist indicating where the new node will be inserted.
	var spl [maxHeight]splice

	// Fast-path for in-order insertion of keys: compare the new key against the
	// last key.
	prev := s.getPrev(s.tail, 0)
	if prevNode := s.node(prev); prev == s.head ||
		abbreviatedKey > prevNode.abbreviatedKey ||
		(abbreviatedKey == prevNode.abbreviatedKey &&
			s.cmp(key, (*s.storage)[prevNode.keyStart:prevNode.keyEnd]) > 0) {
		for level := uint32(0); level < s.height; level++ {
			spl[level].prev = s.getPrev(s.tail, level)
			spl[level].next = s.tail
		}
	} else {
		s.findSplice(key, abbreviatedKey, &spl)
	}

	height := s.randomHeight()
	// Increase s.height as necessary.
	for ; s.height < height; s.height++ {
		spl[s.height].next = s.tail
		spl[s.height].prev = s.head
	}

	// We always insert from the base level and up. After a node is added at the
	// base level, we cannot create a node in the level above because it would
	// have discovered the node in the base level.
	nd, err := s.newNode(height, keyOffset, keyStart, keyEnd, abbreviatedKey)
	if err != nil {
		return err
	}
	newNode := s.node(nd)
	for level := uint32(0); level < height; level++ {
		next := spl[level].next
		prev := spl[level].prev
		newNode.links[level].next = next
		newNode.links[level].prev = prev
		s.node(next).links[level].prev = nd
		s.node(prev).links[level].next = nd
	}

	return nil
}

// NewIter returns a new Iterator object. The lower and upper bound parameters
// control the range of keys the iterator will return. Specifying nil for the
// lower or upper bound disables the check for that boundary. Note that the
// lower bound is not checked on {SeekGE,First} and the upper bound is not
// checked on {SeekLT,Last}. The user is expected to perform that check. Note
// that it is safe for an iterator to be copied by value.
func (s *Skiplist) NewIter(lower, upper []byte) Iterator {
	return Iterator{list: s, lower: lower, upper: upper}
}
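
// A minimal usage sketch (illustrative only; abbreviatedKeyFn and entryOffset
// are hypothetical, and the iteration below assumes the package's Iterator
// exposes Next/Valid alongside the First/SeekGE/SeekLT/Last methods referenced
// above):
//
//	var storage []byte
//	// ...append [kind][key length uvarint][key bytes] records to storage...
//	s := NewSkiplist(&storage, bytes.Compare, abbreviatedKeyFn)
//	if err := s.Add(entryOffset); err != nil {
//		// Add can fail with ErrTooManyRecords or a corruption error.
//	}
//	it := s.NewIter(nil, nil) // nil bounds: no boundary checks
//	for it.First(); it.Valid(); it.Next() {
//		// keys are visited in cmp order
//	}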

func (s *Skiplist) newNode(
	height, offset, keyStart, keyEnd uint32, abbreviatedKey uint64,
) (uint32, error) {
	if height < 1 || height > maxHeight {
		panic("height cannot be less than one or greater than the max height")
	}

	unusedSize := uint64(maxHeight-int(height)) * linksSize
	nodeOffset, err := s.alloc(uint32(maxNodeSize - unusedSize))
	if err != nil {
		return 0, err
	}
	nd := s.node(nodeOffset)

	nd.offset = offset
	nd.keyStart = keyStart
	nd.keyEnd = keyEnd
	nd.abbreviatedKey = abbreviatedKey
	return nodeOffset, nil
}
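
// Sizing sketch (assuming a typical 64-bit struct layout for node): a node of
// height h occupies maxNodeSize - (maxHeight-h)*linksSize bytes. With
// linksSize = 8 (two uint32s) and maxNodeSize = 184 (12 bytes of offsets,
// 4 bytes of alignment padding, an 8-byte abbreviatedKey, and 20*8 bytes of
// links), a height-1 node needs only 184 - 19*8 = 32 bytes.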

func (s *Skiplist) alloc(size uint32) (uint32, error) {
	offset := uint64(len(s.nodes))

	// We only have a need for memory up to offset + size, but we never want
	// to allocate a node whose tail points into unallocated memory.
	minAllocSize := offset + maxNodeSize
	if uint64(cap(s.nodes)) < minAllocSize {
		allocSize := uint64(cap(s.nodes)) * 2
		if allocSize < minAllocSize {
			allocSize = minAllocSize
		}
		// Cap the allocation at the max allowed size to avoid wasted capacity.
		if allocSize > maxNodesSize {
			// The new record may still not fit within the allocation, in which case
			// we return early with an error. This avoids the panic below when we
			// resize the slice. It also avoids the allocation and copy.
			if uint64(offset)+uint64(size) > maxNodesSize {
				return 0, errors.Wrapf(ErrTooManyRecords,
					"alloc of new record (size=%d) would overflow uint32 (current size=%d)",
					uint64(offset)+uint64(size), offset,
				)
			}
			allocSize = maxNodesSize
		}
		tmp := make([]byte, len(s.nodes), allocSize)
		copy(tmp, s.nodes)
		s.nodes = tmp
	}

	newSize := uint32(offset) + size
	s.nodes = s.nodes[:newSize]
	return uint32(offset), nil
}

func (s *Skiplist) node(offset uint32) *node {
	return (*node)(unsafe.Pointer(&s.nodes[offset]))
}

func (s *Skiplist) randomHeight() uint32 {
	rnd := uint32(s.rand.Uint64())
	h := uint32(1)
	for h < maxHeight && rnd <= probabilities[h] {
		h++
	}
	return h
}
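
// A single 32-bit draw decides the full height: probabilities[h] is roughly
// MaxUint32 * (1/e)^h, so the loop above reaches level h+1 with probability
// ~(1/e)^h. Heights are thus geometrically distributed, with an expected
// tower height of about 1/(1-1/e) ≈ 1.58 links per node.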

func (s *Skiplist) findSplice(key []byte, abbreviatedKey uint64, spl *[maxHeight]splice) {
	prev := s.head

	for level := s.height - 1; ; level-- {
		// The code in this loop is the same as findSpliceForLevel(). For some
		// reason, calling findSpliceForLevel() here is much much slower than the
		// inlined code below. The excess time is also caught up in the final
		// return statement which makes little sense. Revisit when in go1.14 or
		// later if inlining improves.

		next := s.getNext(prev, level)
		for next != s.tail {
			// Assume prev.key < key.
			nextNode := s.node(next)
			nextAbbreviatedKey := nextNode.abbreviatedKey
			if abbreviatedKey < nextAbbreviatedKey {
				// We are done for this level, since prev.key < key < next.key.
				break
			}
			if abbreviatedKey == nextAbbreviatedKey {
				if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
					// We are done for this level, since prev.key < key <= next.key.
					break
				}
			}

			// Keep moving right on this level.
			prev = next
			next = nextNode.links[level].next
		}

		spl[level].prev = prev
		spl[level].next = next
		if level == 0 {
			break
		}
	}
}

func (s *Skiplist) findSpliceForLevel(
	key []byte, abbreviatedKey uint64, level, start uint32,
) (prev, next uint32) {
	prev = start
	next = s.getNext(prev, level)

	for next != s.tail {
		// Assume prev.key < key.
		nextNode := s.node(next)
		nextAbbreviatedKey := nextNode.abbreviatedKey
		if abbreviatedKey < nextAbbreviatedKey {
			// We are done for this level, since prev.key < key < next.key.
			break
		}
		if abbreviatedKey == nextAbbreviatedKey {
			if s.cmp(key, (*s.storage)[nextNode.keyStart:nextNode.keyEnd]) <= 0 {
				// We are done for this level, since prev.key < key <= next.key.
				break
			}
		}

		// Keep moving right on this level.
		prev = next
		next = nextNode.links[level].next
	}

	return
}

func (s *Skiplist) getKey(nd uint32) base.InternalKey {
	n := s.node(nd)
	kind := base.InternalKeyKind((*s.storage)[n.offset])
	key := (*s.storage)[n.keyStart:n.keyEnd]
	return base.MakeInternalKey(key, uint64(n.offset)|base.InternalKeySeqNumBatch, kind)
}

func (s *Skiplist) getNext(nd, h uint32) uint32 {
	return s.node(nd).links[h].next
}

func (s *Skiplist) getPrev(nd, h uint32) uint32 {
	return s.node(nd).links[h].prev
}

func (s *Skiplist) debug() string {
	var buf bytes.Buffer
	for level := uint32(0); level < s.height; level++ {
		var count int
		for nd := s.head; nd != s.tail; nd = s.getNext(nd, level) {
			count++
		}
		fmt.Fprintf(&buf, "%d: %d\n", level, count)
	}
	return buf.String()
}

// Silence unused warning.
var _ = (*Skiplist).debug